import pandas as pd
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Load the full listings file; `vehicle` is the base frame for the cells below.
vehicle = pd.read_csv("vehicles.csv")
# Write the first 1000 rows to a smaller CSV file.
vehicle[:1000].to_csv("data_file.csv")
# Hourly posting counts with the hour-of-day effect regressed out.
# statsmodels is imported here because this cell comes before the file's
# later `import statsmodels.formula.api as smf` line.
import statsmodels.formula.api as smf

# Explicit copy: the column assignment below must not write into a slice of
# `vehicle` (that is what raises SettingWithCopyWarning).
vehicle_2 = vehicle[vehicle["posting_date"].notna()].copy()
# Drop the last five characters (presumably the UTC offset -- TODO confirm) so
# the remainder matches the naive timestamp format parsed here.
vehicle_2["posting_date"] = pd.to_datetime(
    vehicle_2["posting_date"].str[:-5], format='%Y-%m-%dT%H:%M:%S'
)
# One row per hour holding the number of listing ids posted in that hour.
df = (
    vehicle_2.resample('H', on='posting_date')["id"]
    .count()
    .to_frame("Hourly_Transaction")
    .rename_axis(None)
)
# Hour of day as an unpadded string ("0".."23"); unlike strftime('%-H') this
# does not rely on a platform-specific format flag.
df["hour"] = df.index.hour.astype(str)
# Residuals of count ~ hour-of-day dummies are the "de-houred" series.
model1 = smf.ols('Hourly_Transaction ~ C(hour)', data=df).fit()
df['dehourfied_hourly_Transaction'] = model1.resid
df[:50]
| Hourly_Transaction | hour | dehourfied_hourly_Transaction | |
|---|---|---|---|
| 2021-04-04 00:00:00 | 1 | 0 | -70.419355 |
| 2021-04-04 01:00:00 | 10 | 1 | -39.483871 |
| 2021-04-04 02:00:00 | 5 | 2 | -30.741935 |
| 2021-04-04 03:00:00 | 63 | 3 | -9.580645 |
| 2021-04-04 04:00:00 | 69 | 4 | -35.129032 |
| 2021-04-04 05:00:00 | 89 | 5 | -33.258065 |
| 2021-04-04 06:00:00 | 91 | 6 | -137.580645 |
| 2021-04-04 07:00:00 | 133 | 7 | -340.483871 |
| 2021-04-04 08:00:00 | 282 | 8 | -555.419355 |
| 2021-04-04 09:00:00 | 365 | 9 | -832.870968 |
| 2021-04-04 10:00:00 | 561 | 10 | -801.580645 |
| 2021-04-04 11:00:00 | 403 | 11 | -964.677419 |
| 2021-04-04 12:00:00 | 355 | 12 | -853.741935 |
| 2021-04-04 13:00:00 | 324 | 13 | -831.451613 |
| 2021-04-04 14:00:00 | 272 | 14 | -759.548387 |
| 2021-04-04 15:00:00 | 211 | 15 | -730.193548 |
| 2021-04-04 16:00:00 | 115 | 16 | -765.806452 |
| 2021-04-04 17:00:00 | 85 | 17 | -648.870968 |
| 2021-04-04 18:00:00 | 86 | 18 | -463.903226 |
| 2021-04-04 19:00:00 | 99 | 19 | -317.774194 |
| 2021-04-04 20:00:00 | 74 | 20 | -261.741935 |
| 2021-04-04 21:00:00 | 131 | 21 | -154.322581 |
| 2021-04-04 22:00:00 | 58 | 22 | -133.032258 |
| 2021-04-04 23:00:00 | 29 | 23 | -85.516129 |
| 2021-04-05 00:00:00 | 32 | 0 | -39.419355 |
| 2021-04-05 01:00:00 | 14 | 1 | -35.483871 |
| 2021-04-05 02:00:00 | 16 | 2 | -19.741935 |
| 2021-04-05 03:00:00 | 76 | 3 | 3.419355 |
| 2021-04-05 04:00:00 | 101 | 4 | -3.129032 |
| 2021-04-05 05:00:00 | 71 | 5 | -51.258065 |
| 2021-04-05 06:00:00 | 107 | 6 | -121.580645 |
| 2021-04-05 07:00:00 | 219 | 7 | -254.483871 |
| 2021-04-05 08:00:00 | 433 | 8 | -404.419355 |
| 2021-04-05 09:00:00 | 634 | 9 | -563.870968 |
| 2021-04-05 10:00:00 | 950 | 10 | -412.580645 |
| 2021-04-05 11:00:00 | 856 | 11 | -511.677419 |
| 2021-04-05 12:00:00 | 738 | 12 | -470.741935 |
| 2021-04-05 13:00:00 | 665 | 13 | -490.451613 |
| 2021-04-05 14:00:00 | 680 | 14 | -351.548387 |
| 2021-04-05 15:00:00 | 507 | 15 | -434.193548 |
| 2021-04-05 16:00:00 | 374 | 16 | -506.806452 |
| 2021-04-05 17:00:00 | 289 | 17 | -444.870968 |
| 2021-04-05 18:00:00 | 181 | 18 | -368.903226 |
| 2021-04-05 19:00:00 | 179 | 19 | -237.774194 |
| 2021-04-05 20:00:00 | 162 | 20 | -173.741935 |
| 2021-04-05 21:00:00 | 163 | 21 | -122.322581 |
| 2021-04-05 22:00:00 | 124 | 22 | -67.032258 |
| 2021-04-05 23:00:00 | 51 | 23 | -63.516129 |
| 2021-04-06 00:00:00 | 28 | 0 | -43.419355 |
| 2021-04-06 01:00:00 | 14 | 1 | -35.483871 |
from mizani.formatters import comma_format
from plotnine import *
from plotnine.stats import *
from mizani.breaks import date_breaks
from mizani.formatters import date_format
# Plot the hourly posting counts after removing the hour-of-day effect.
# Imported locally because the file-level statsmodels import appears later.
import statsmodels.formula.api as smf

# Copy so the parsed timestamps are written to an independent frame rather
# than a slice of `vehicle`.
vehicle_2 = vehicle[vehicle["posting_date"].notna()].copy()
# Last five characters are stripped (presumably the UTC offset -- TODO confirm).
vehicle_2["posting_date"] = pd.to_datetime(
    vehicle_2["posting_date"].str[:-5], format='%Y-%m-%dT%H:%M:%S'
)
df = (
    vehicle_2.resample('H', on='posting_date')["id"]
    .count()
    .to_frame("Hourly_Transaction")
    .rename_axis(None)
)
# Portable replacement for strftime('%-H'): unpadded hour as a string.
df["hour"] = df.index.hour.astype(str)
model1 = smf.ols('Hourly_Transaction ~ C(hour)', data=df).fit()
df['dehourfied_hourly_Transaction'] = model1.resid
b = (
    ggplot(df, aes(df.index, df["dehourfied_hourly_Transaction"]))
    + geom_point(size=0.5)
    + geom_line(aes(group=1))
    + theme(figure_size=(10, 8))
    + theme(text=element_text(size=10))
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
    + labs(x='Date', y='Dehourfied Hourly Secondhand Transaction Posting Number')
    + ggtitle("Dehourfied Hourly Secondhand Transaction Posting Number")
    + scale_x_datetime(breaks=date_breaks('1 day'), labels=(date_format('%Y-%m-%d %H')))
)
b
<ggplot: (1457514288261)>
from mizani.formatters import comma_format
from plotnine import *
from plotnine.stats import *
from mizani.breaks import date_breaks
from mizani.formatters import date_format
# Plot the raw number of postings per hour.
# Copy first: assigning into a filtered slice of `vehicle` raises
# SettingWithCopyWarning and may not update the frame being used.
vehicle_2 = vehicle[vehicle["posting_date"].notna()].copy()
# Last five characters are stripped (presumably the UTC offset -- TODO confirm).
vehicle_2["posting_date"] = pd.to_datetime(
    vehicle_2["posting_date"].str[:-5], format='%Y-%m-%dT%H:%M:%S'
)
# The count column is named 0 because the plot below refers to df[0].
df = (
    vehicle_2.resample('H', on='posting_date')["id"]
    .count()
    .to_frame(0)
    .rename_axis(None)
)
b = (
    ggplot(df, aes(df.index, df[0]))
    + geom_point(size=0.5)
    + geom_line(aes(group=1))
    + theme(figure_size=(10, 8))
    + theme(text=element_text(size=10))
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
    + labs(x='Date', y='Hourly Secondhand Transaction Posting Number')
    + ggtitle("Hourly Secondhand Transaction Posting Number")
    + scale_x_datetime(breaks=date_breaks('1 day'), labels=(date_format('%Y-%m-%d %H')))
)
b
<ggplot: (1457515941678)>
from mizani.formatters import comma_format
from plotnine import *
from plotnine.stats import *
from mizani.breaks import date_breaks
from mizani.formatters import date_format
# Number of postings per calendar day, indexed by date, in a column named 0.
df_1 = (
    vehicle_2.groupby([vehicle_2['posting_date'].dt.date])["id"]
    .count()
    .to_frame(0)
    .rename_axis(None)
)
# Point-and-line chart of the daily counts with one x tick per day.
b = (
    ggplot(df_1, aes(df_1.index, df_1[0]))
    + geom_point(size=0.5)
    + geom_line(aes(group=1))
    + theme(figure_size=(10, 8))
    + theme(text=element_text(size=10))
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
    + labs(x='Date', y='Daily Secondhand Transaction Posting Number')
    + ggtitle("Daily Secondhand Transaction Posting Number")
    + scale_x_datetime(breaks=date_breaks('1 day'), labels=(date_format('%Y-%m-%d')))
)
b
<ggplot: (1457518854919)>
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Daily posting counts with the day-of-week effect regressed out.
df_1 = (
    vehicle_2.groupby([vehicle_2['posting_date'].dt.date])["id"]
    .count()
    .to_frame("daily_transaction")
    .rename_axis(None)
)
# Weekday name of each date, used as a categorical regressor.
df_1["weekday"] = pd.to_datetime(df_1.index).strftime('%A')
# Residuals of count ~ weekday dummies are the "de-weekdayed" series.
model1 = smf.ols('daily_transaction ~ C(weekday)', data=df_1).fit()
df_1['deweekfied_daily_transaction'] = model1.resid
df_1
| daily_transaction | weekday | deweekfied_daily_transaction | |
|---|---|---|---|
| 2021-04-04 | 3911 | Sunday | -4389.20 |
| 2021-04-05 | 7622 | Monday | -8198.60 |
| 2021-04-06 | 7041 | Tuesday | -7959.20 |
| 2021-04-07 | 7780 | Wednesday | -4825.25 |
| 2021-04-08 | 8897 | Thursday | -4998.00 |
| 2021-04-09 | 9288 | Friday | -7638.50 |
| 2021-04-10 | 7350 | Saturday | -7025.00 |
| 2021-04-11 | 5127 | Sunday | -3173.20 |
| 2021-04-12 | 8803 | Monday | -7017.60 |
| 2021-04-13 | 8464 | Tuesday | -6536.20 |
| 2021-04-14 | 10214 | Wednesday | -2391.25 |
| 2021-04-15 | 10417 | Thursday | -3478.00 |
| 2021-04-16 | 11404 | Friday | -5522.50 |
| 2021-04-17 | 10166 | Saturday | -4209.00 |
| 2021-04-18 | 6257 | Sunday | -2043.20 |
| 2021-04-19 | 10910 | Monday | -4910.60 |
| 2021-04-20 | 11157 | Tuesday | -3843.20 |
| 2021-04-21 | 12943 | Wednesday | 337.75 |
| 2021-04-22 | 14920 | Thursday | 1025.00 |
| 2021-04-23 | 18898 | Friday | 1971.50 |
| 2021-04-24 | 13973 | Saturday | -402.00 |
| 2021-04-25 | 8430 | Sunday | 129.80 |
| 2021-04-26 | 16064 | Monday | 243.40 |
| 2021-04-27 | 17987 | Tuesday | 2986.80 |
| 2021-04-28 | 19484 | Wednesday | 6878.75 |
| 2021-04-29 | 21346 | Thursday | 7451.00 |
| 2021-04-30 | 28116 | Friday | 11189.50 |
| 2021-05-01 | 26011 | Saturday | 11636.00 |
| 2021-05-02 | 17776 | Sunday | 9475.80 |
| 2021-05-03 | 35704 | Monday | 19883.40 |
| 2021-05-04 | 30352 | Tuesday | 15351.80 |
from mizani.formatters import comma_format
from plotnine import *
from plotnine.stats import *
from mizani.breaks import date_breaks
from mizani.formatters import date_format
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Rebuild the de-weekdayed daily series and draw it.
df_1 = (
    vehicle_2.groupby([vehicle_2['posting_date'].dt.date])["id"]
    .count()
    .to_frame("daily_transaction")
    .rename_axis(None)
)
df_1["weekday"] = pd.to_datetime(df_1.index).strftime('%A')
model1 = smf.ols('daily_transaction ~ C(weekday)', data=df_1).fit()
df_1['deweekfied_daily_transaction'] = model1.resid
b = (
    ggplot(df_1, aes(df_1.index, df_1['deweekfied_daily_transaction']))
    + geom_point(size=0.5)
    + geom_line(aes(group=1))
    + theme(figure_size=(10, 8))
    + theme(text=element_text(size=10))
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
    + labs(x='Date', y='Deweekdayfied Daily Secondhand Transaction Posting Number')
    + ggtitle("Deweekdayfied Daily Secondhand Transaction Posting Number")
    + scale_x_datetime(breaks=date_breaks('1 day'), labels=(date_format('%Y-%m-%d')))
)
b
<ggplot: (1457517091291)>
# Listings usable for pricing analysis: model, year, manufacturer, price and
# odometer all present, and a non-zero price.
# The price comparison is parenthesised on purpose: `&` binds tighter than
# `!=`, so `mask & vehicle["price"] != 0` would AND the mask with the price
# values before comparing, instead of testing price != 0.
vehicle_1 = vehicle[
    vehicle[["model", "year", "manufacturer", "price", "odometer"]].notna().all(axis=1)
    & (vehicle["price"] != 0)
].copy()  # copy so the new column is not written into a slice of `vehicle`
# Age in years relative to 2021.
vehicle_1["age"] = 2021 - vehicle_1["year"].astype(int)
/tmp/ipykernel_130750/2315890347.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy vehicle_1["age"] = 2021-vehicle_1["year"].astype(int)
from collections import Counter
# Body types of the ten most-listed models of one manufacturer.
# NOTE: `model` holds a manufacturer name here, not a model name.
model = "ford"
list_1 = ["SUV", "convertible", "coupe", "hatchback", "pickup", "sedan", "truck"]
vehicle_2 = vehicle_1[vehicle_1["manufacturer"] == model]
vehicle_2 = vehicle_2[vehicle_2["type"].isin(list_1)]
# Keep only the ten most frequent models of that manufacturer.
vehicle_3 = vehicle_2[
    vehicle_2["model"].isin([name for name, _ in Counter(vehicle_2["model"]).most_common(10)])
]
# Number of distinct listing ids per (model, type); missing pairs become 0.
vehicle_3_df = vehicle_3.pivot_table(
    index='model', columns='type', values='id', aggfunc=lambda x: len(x.unique())
).fillna(0)
from statsmodels.graphics.mosaicplot import mosaic
# The mosaic uses the same top-10 frame as the pivot table. The original
# passed `properties=props`, but no `props` is defined in the visible file, so
# the default tile styling is used instead.
t1 = mosaic(
    vehicle_3,
    ['model', 'type'],
    title="Models and Types of Cars Manufactured by " + model.capitalize(),
)
# Show the pivot table built above (the original referenced an undefined
# `vehicle_2_df`).
vehicle_3_df
| type | SUV | convertible | coupe | hatchback | other | pickup | sedan | truck | van | wagon |
|---|---|---|---|---|---|---|---|---|---|---|
| model | ||||||||||
| edge | 400.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 27.0 | 0.0 | 0.0 | 4.0 |
| escape | 917.0 | 0.0 | 0.0 | 0.0 | 17.0 | 0.0 | 5.0 | 1.0 | 0.0 | 25.0 |
| expedition | 281.0 | 0.0 | 0.0 | 0.0 | 17.0 | 0.0 | 1.0 | 1.0 | 0.0 | 14.0 |
| explorer | 736.0 | 0.0 | 0.0 | 0.0 | 19.0 | 2.0 | 0.0 | 2.0 | 0.0 | 14.0 |
| f-150 | 2.0 | 0.0 | 0.0 | 1.0 | 40.0 | 1420.0 | 0.0 | 1246.0 | 1.0 | 0.0 |
| f-250 | 0.0 | 0.0 | 1.0 | 0.0 | 8.0 | 369.0 | 0.0 | 55.0 | 0.0 | 0.0 |
| f150 | 0.0 | 0.0 | 0.0 | 0.0 | 26.0 | 120.0 | 0.0 | 84.0 | 0.0 | 0.0 |
| focus | 0.0 | 0.0 | 8.0 | 191.0 | 37.0 | 0.0 | 389.0 | 0.0 | 0.0 | 2.0 |
| fusion | 3.0 | 0.0 | 0.0 | 1.0 | 26.0 | 0.0 | 631.0 | 0.0 | 0.0 | 0.0 |
| mustang | 7.0 | 120.0 | 476.0 | 0.0 | 97.0 | 0.0 | 2.0 | 1.0 | 0.0 | 0.0 |
import seaborn as sns
# Histogram of the `age` column of the filtered listings frame.
sns.histplot(vehicle_1["age"])
<AxesSubplot: xlabel='age', ylabel='Count'>
# Listings whose model is exactly 'f-150', with numeric year/age/price columns.
# .copy() makes the assignments below act on an independent frame instead of
# a slice of vehicle_1 (the source of the SettingWithCopyWarning).
df = vehicle_1[vehicle_1["model"] == 'f-150'].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
df.shape
/tmp/ipykernel_1568744/3826991033.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1568744/3826991033.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1568744/3826991033.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
(3430, 27)
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import warnings
# 4x4 grid of correlation heatmaps (log price, age, odometer), one panel per
# model, for the 16 most-listed models of one manufacturer.
warnings.filterwarnings('ignore')
fig, axs = plt.subplots(4, 4, figsize=(20, 20))
model = "nissan"  # manufacturer name
# Not used by the heatmaps in this cell.
colors = {'excellent': 'red', 'fair': 'orange',
          'good': 'yellow', 'like new': 'blue',
          'new': 'green', 'salvage': 'purple'}
fig.suptitle("Correlation Matrix of Cars Manufactured by " + model.capitalize(),
             fontsize=20)
# Independent copy so the derived columns are not written into a slice.
df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
df["log(price)"] = np.log(df["price"])
# The 16 most frequent model names, most common first.
car_list = [name for name, _ in Counter(df["model"].dropna()).most_common(16)]
for i, name in enumerate(car_list):
    matrix_1 = df.loc[df["model"] == name, ["log(price)", "age", "odometer"]].corr()
    # Panels are filled row by row; the colour scale is fixed to [-1, 1].
    g1 = sns.heatmap(matrix_1, annot=True, ax=axs[i // 4, i % 4], vmin=-1, vmax=1)
    g1.set_title(name, fontsize=15)
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
# 4x4 grid of correlation heatmaps (price, age, odometer), one panel per
# model, for the 16 most-listed models of one manufacturer.
fig, axs = plt.subplots(4, 4, figsize=(20, 20))
model = "ford"  # manufacturer name
# Not used by the heatmaps in this cell.
colors = {'excellent': 'red', 'fair': 'orange',
          'good': 'yellow', 'like new': 'blue',
          'new': 'green', 'salvage': 'purple'}
fig.suptitle("Correlation Matrix of Cars Manufactured by " + model.capitalize(),
             fontsize=20)
# Independent copy so the derived columns are not written into a slice.
df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
df["log(price)"] = np.log(df["price"])
# The 16 most frequent model names, most common first.
car_list = [name for name, _ in Counter(df["model"].dropna()).most_common(16)]
for i, name in enumerate(car_list):
    # NOTE(review): "log(price)" is computed above but the raw "price" column
    # is what gets correlated here -- confirm which one is intended.
    matrix_1 = df.loc[df["model"] == name, ["price", "age", "odometer"]].corr()
    g1 = sns.heatmap(matrix_1, annot=True, ax=axs[i // 4, i % 4], vmin=-1, vmax=1)
    g1.set_title(name, fontsize=15)
/tmp/ipykernel_2639724/1030447802.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_2639724/1030447802.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_2639724/1030447802.py:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float) /tmp/ipykernel_2639724/1030447802.py:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["log(price)"] = np.log(df["price"])
# The 20 most frequent manufacturers with their listing counts.
Counter(vehicle_1["manufacturer"].dropna()).most_common(20)
[('ford', 25651),
('chevrolet', 18388),
('toyota', 11116),
('honda', 7005),
('nissan', 6750),
('ram', 6604),
('jeep', 6560),
('gmc', 5477),
('dodge', 4679),
('bmw', 4401),
('hyundai', 3558),
('mercedes-benz', 3478),
('subaru', 3394),
('kia', 2952),
('volkswagen', 2559),
('lexus', 2320),
('chrysler', 2134),
('cadillac', 2086),
('audi', 2019),
('mazda', 1602)]
# Names of the five most frequent manufacturers, most common first.
[name for name, _count in Counter(vehicle_1["manufacturer"].dropna()).most_common(5)]
['ford', 'chevrolet', 'toyota', 'honda', 'nissan']
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# For each of the five most-listed manufacturers, draw two violin plots by
# listing condition: odometer (left column) and age (right column).
warnings.filterwarnings('ignore')
manu_list = [name for name, _ in Counter(vehicle_1["manufacturer"].dropna()).most_common(5)]
n = 4
fig, axs = plt.subplots(5, 2, figsize=(8*2, 8*5))
for i, model in enumerate(manu_list):
    # Independent copy so the derived columns are not written into a slice.
    df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
    df["year"] = df["year"].astype(int)
    df["age"] = 2021 - df["year"]
    df["price"] = df["price"].astype(float)
    df = df[["model", "manufacturer", "condition", "type", "price", "age", "odometer"]]
    # Keep odometer readings in (0, 300000] and ages below 40.
    df = df[(df["odometer"] <= 300000) & (df["odometer"] > 0) & (df["age"] < 40)]
    # Keep the six most frequent condition values.
    df = df[df["condition"].isin([c for c, _ in Counter(list(df["condition"])).most_common(6)])]
    # `ylim` is not a violinplot parameter (depending on the seaborn version
    # it is ignored or rejected), so it is no longer passed.
    g = sns.violinplot(data=df, x="condition", y="odometer", ax=axs[i, 0])
    g.set_title("Mileage Distribution and Conditions of\n Vehicles Manufactured by "+model.capitalize(), fontsize=20)
    # NOTE(review): the age panel is additionally restricted to the 2nd-6th
    # most frequent body types (the most frequent one is skipped) -- confirm
    # this is intended.
    df = df[df["type"].isin([t for t, _ in Counter(list(df["type"])).most_common(6)][1:])]
    g = sns.violinplot(data=df, x="condition", y="age", ax=axs[i, 1])
    g.set_title("Age Distribution and Conditions of\n Vehicles Manufactured by "+model.capitalize(), fontsize=20)
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# For each of the five most-listed manufacturers, draw two violin plots by
# body type: odometer (left column) and age (right column).
warnings.filterwarnings('ignore')
manu_list = [name for name, _ in Counter(vehicle_1["manufacturer"].dropna()).most_common(5)]
n = 4
fig, axs = plt.subplots(5, 2, figsize=(8*2, 8*5))
for i, model in enumerate(manu_list):
    # Independent copy so the derived columns are not written into a slice.
    df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
    df["year"] = df["year"].astype(int)
    df["age"] = 2021 - df["year"]
    df["price"] = df["price"].astype(float)
    df = df[["model", "manufacturer", "condition", "type", "price", "age", "odometer"]]
    # Keep odometer readings in (0, 300000] and ages below 40.
    df = df[(df["odometer"] <= 300000) & (df["odometer"] > 0) & (df["age"] < 40)]
    # Keep the six most frequent body types.
    df = df[df["type"].isin([t for t, _ in Counter(list(df["type"])).most_common(6)])]
    # `ylim` is not a violinplot parameter (depending on the seaborn version
    # it is ignored or rejected), so it is no longer passed.
    g = sns.violinplot(data=df, x="type", y="odometer", ax=axs[i, 0])
    g.set_title("Mileage Distribution and Types of\n Vehicles Manufactured by "+model.capitalize(), fontsize=20)
    # NOTE(review): the age panel drops the most frequent remaining type
    # (slice [1:]) while the mileage panel keeps it -- confirm this is intended.
    df = df[df["type"].isin([t for t, _ in Counter(list(df["type"])).most_common(6)][1:])]
    g = sns.violinplot(data=df, x="type", y="age", ax=axs[i, 1])
    g.set_title("Age Distribution and Types of\n Vehicles Manufactured by "+model.capitalize(), fontsize=20)
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
# n x n grid of price-vs-age scatter plots with a LOWESS curve, one panel per
# model, for the n*n most-listed models of one manufacturer.
model = "bmw"  # manufacturer name
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Price and Age of Cars Manufactured by " +
             model.capitalize() + " of Different Models", fontsize=20)
# The panels plot age, so the printed label says "Age" (it used to say
# "Mileage", contradicting the figure title).
print("Price and Age of Cars Manufactured by " +
      model.capitalize() + " of Different Models")
# Independent copy so the derived columns are not written into a slice.
df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna()).most_common(n**2)]
for i, name in enumerate(car_list):
    # Select from `df` (this manufacturer only); selecting from vehicle_1
    # would also pull in same-named models of other manufacturers.
    df_1 = df[df["model"] == name]
    g1 = sns.regplot(x="age", y="price", data=df_1, ax=axs[i // n, i % n],
                     scatter_kws={'alpha': 0.1}, lowess=True,
                     line_kws={"color": "red"})
    g1.set_title(name, fontsize=15)
    g1.set_xlim([-2, 40])
    g1.set_ylim([0, 80000])
    g1.set_xlabel("Age")
    g1.set_ylabel("Price")
Price and Mileage of Cars Manufactured by Bmw of Different Models
/tmp/ipykernel_2639724/8670581.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_2639724/8670581.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_2639724/8670581.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Huber robust regression of log price on age for listings whose model is
# 'f-150'; the displayed value is the age slope.
df_1 = vehicle_1[vehicle_1["model"] == "f-150"].copy()
df_1["log_price"] = np.log(df_1["price"])
# The original fitted on `df_2`, which this cell never defines (it only
# existed as leftover state from another cell). The sample is built here
# explicitly with the same age window the depreciation loop later in the file
# uses: 2 <= age <= 15 and age at or below its 90th percentile.
df_2 = df_1[(df_1["age"] >= 2) & (df_1["age"] <= 15)
            & (df_1["age"] <= np.quantile(df_1["age"], .90))]
model1 = sm.RLM(df_2["log_price"], sm.add_constant(df_2["age"]),
                M=sm.robust.norms.HuberT(t=2)).fit()
model1.params["age"]
-0.1308455505353119
import sklearn.decomposition as skd
import sklearn.preprocessing as skp
# PCA of standardised age and odometer for the rows currently in df_1.
# Work on a copy so the in-place column updates never target a slice.
df_1 = df_1.copy()
df_1[["price", "age", "odometer"]] = df_1[["price", "age", "odometer"]].astype(float)
# Take the log BEFORE standardising: after scaling, price is centred on zero,
# so the log of the scaled column is NaN for every below-average price.
df_1["log price"] = np.log(df_1["price"])
df_1[["price", "age", "odometer"]] = skp.scale(df_1[["price", "age", "odometer"]])
df_1 = df_1[["model", "price", "log price", "age", "odometer"]]
pca_model = skd.PCA().fit(df_1[["age", "odometer"]])
# Rows are the principal axes expressed in (age, odometer) coordinates.
pca_model.components_
array([[ 0.70710678, 0.70710678],
[-0.70710678, 0.70710678]])
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
# n x n grid of log-price-vs-age scatter plots, one panel per model, each
# annotated with an OLS fit of log price on age (ages 2..15).
warnings.filterwarnings('ignore')
# The manufacturer was an empty string, which selects no rows and leaves every
# panel blank; the recorded output of this cell names Ford.
model = "ford"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Logged Price and Age of Cars Manufactured by "+model.capitalize()+" of Different Models", fontsize=30)
print("Price and Age of Cars Manufactured by "+model.capitalize()+" of Different Models")
# Independent copy so the derived columns are not written into a slice.
df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna()).most_common(n**2)]
for i, name in enumerate(car_list):
    # Select from `df` so same-named models of other manufacturers are excluded.
    df_1 = df[df["model"] == name].copy()
    df_1["log_price"] = np.log(df_1["price"])
    # The annotated fit uses ages 2..15 only; the scatter shows every listing.
    df_2 = df_1[(df_1["age"] >= 2) & (df_1["age"] <= 15)]
    model1 = smf.ols('log_price ~ age', data=df_2).fit()
    # The red line is seaborn's own robust fit over all rows of df_1.
    g1 = sns.regplot(x="age", y="log_price", data=df_1, ax=axs[i // n, i % n],
                     scatter_kws={'alpha': 0.1}, lowess=False,
                     robust=True, ci=None,
                     line_kws={"color": "red"})
    g1.set_title(name, fontsize=15)
    g1.set_xlim([-2, 30])
    g1.set_ylim([7.5, 12.5])
    g1.set_xlabel("Age", fontsize=10)
    g1.set_ylabel("Logged Price", fontsize=10)
    g1.text(10, 12, "y="+str(round(model1.params["age"], 3))+"x+"+str(round(model1.params["Intercept"], 2)), fontsize=15)
    g1.text(10, 11.5, "age p-value: "+str(round(model1.pvalues["age"], 3)), fontsize=15)
    try:
        g1.text(10, 11, "F p-value: "+str(round(model1.f_pvalue, 3)), fontsize=15)
    except (ValueError, ZeroDivisionError, FloatingPointError):
        # Best effort: leave the F-test line out when it cannot be computed.
        pass
    g1.text(10, 10.5, "R2: "+str(round(model1.rsquared, 3)), fontsize=15)
Price and Age of Cars Manufactured by Ford of Different Models
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
# n x n grid of log-price-vs-age scatter plots, one panel per model, each
# annotated with a Huber robust regression of log price on age.
warnings.filterwarnings('ignore')
model = "ford"  # manufacturer name
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Logged Price and Age of Cars Manufactured by "+model.capitalize() +
             " of Different Models\n Robust Regression Huber Method", fontsize=30)
print("Price and Age of Cars Manufactured by "+model.capitalize()+" of Different Models")
# Independent copy so the derived columns are not written into a slice.
df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna()).most_common(n**2)]
for i, name in enumerate(car_list):
    # Select from `df` so same-named models of other manufacturers are excluded.
    df_1 = df[df["model"] == name].copy()
    df_1["log_price"] = np.log(df_1["price"])
    # Fit sample: age >= 1 and at or below the model's 95th-percentile age.
    df_2 = df_1[(df_1["age"] >= 1) & (df_1["age"] <= np.quantile(df_1["age"], .95))]
    # (The unused `sm.datasets.stackloss` lines that used to sit here were
    # leftover example code and had no effect on the plot.)
    try:
        model1 = sm.RLM(df_2["log_price"], sm.add_constant(df_2["age"]),
                        M=sm.robust.norms.HuberT(t=2)).fit()
        fit_label = ("y="+str(round(model1.params["age"], 3))+"x+"
                     + str(round(model1.params["const"], 2)))
    except (ValueError, ZeroDivisionError, KeyError, np.linalg.LinAlgError):
        # No usable fit for this model. Clear the label so the panel is not
        # annotated with the coefficients of the previous iteration's model.
        fit_label = None
    g1 = sns.regplot(x="age", y="log_price", data=df_1, ax=axs[i // n, i % n],
                     scatter_kws={'alpha': 0.1}, lowess=False,
                     robust=True, ci=None,
                     line_kws={"color": "red"})
    g1.set_title(name, fontsize=15)
    g1.set_xlim([-2, 30])
    g1.set_ylim([7.5, 12.5])
    g1.set_xlabel("Age", fontsize=10)
    g1.set_ylabel("Logged Price", fontsize=10)
    if fit_label is not None:
        g1.text(10, 12, fit_label, fontsize=15)
Price and Age of Cars Manufactured by Ford of Different Models
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
# For the 16 most-listed models of each of the 20 most-listed manufacturers,
# fit two Huber robust regressions of log price (on age, and on odometer) and
# collect slopes and intercepts in the parallel containers below.
warnings.filterwarnings('ignore')
manufacturer_list = []
model_list = []
dict_age_depre_rate = {}      # "<manufacturer> <model>" -> age slope
age_const = []
dict_mileage_depre_rate = {}  # "<manufacturer> <model>" -> odometer slope * 10**6
mileage_const = []
n = 4
for model in [name for name, _ in
              Counter(vehicle_1["manufacturer"].dropna()).most_common(20)]:
    print("Price and Age of Cars Manufactured by "+model.capitalize()+" of Different Models")
    # Independent copy so the derived columns are not written into a slice.
    df = vehicle_1[vehicle_1["manufacturer"] == model].copy()
    df["year"] = df["year"].astype(int)
    df["age"] = 2021 - df["year"]
    df["price"] = df["price"].astype(float)
    car_list = [name for name, _ in Counter(df["model"].dropna()).most_common(n**2)]
    for i, name in enumerate(car_list):
        # Select from `df`: the result is keyed by manufacturer + model, so
        # same-named models of other manufacturers must not be pooled in.
        df_1 = df[df["model"] == name].copy()
        df_1["log_price"] = np.log(df_1["price"])
        # Age sample: 2..15 years and at or below the 90th-percentile age.
        df_2 = df_1[(df_1["age"] >= 2) & (df_1["age"] <= 15)
                    & (df_1["age"] <= np.quantile(df_1["age"], .90))]
        # Mileage sample: 8000..300000 and at or below the 85th percentile.
        df_3 = df_1[(df_1["odometer"] >= 8000) & (df_1["odometer"] <= 300000)
                    & (df_1["odometer"] <= np.quantile(df_1["odometer"], .85))]
        try:
            model1 = sm.RLM(df_2["log_price"], sm.add_constant(df_2["age"]),
                            M=sm.robust.norms.HuberT(t=2)).fit()
            model2 = sm.RLM(df_3["log_price"], sm.add_constant(df_3["odometer"]),
                            M=sm.robust.norms.HuberT(t=2)).fit()
            # Read every coefficient before storing anything, so a failure
            # cannot leave the parallel lists with different lengths.
            age_intercept = round(model1.params["const"], 4)
            mileage_intercept = round(model2.params["const"], 4)
            age_slope = round(model1.params["age"], 4)
            mileage_slope = round(model2.params["odometer"]*10**6, 3)
        except (ValueError, ZeroDivisionError, KeyError, np.linalg.LinAlgError):
            # Models whose regressions cannot be fitted are skipped.
            continue
        age_const.append(age_intercept)
        mileage_const.append(mileage_intercept)
        dict_age_depre_rate[model+" "+name] = age_slope
        dict_mileage_depre_rate[model+" "+name] = mileage_slope
        manufacturer_list.append(model)
        model_list.append(name)
Price and Age of Cars Manufactured by Ford of Different Models Price and Age of Cars Manufactured by Chevrolet of Different Models Price and Age of Cars Manufactured by Toyota of Different Models Price and Age of Cars Manufactured by Honda of Different Models Price and Age of Cars Manufactured by Nissan of Different Models Price and Age of Cars Manufactured by Ram of Different Models Price and Age of Cars Manufactured by Jeep of Different Models Price and Age of Cars Manufactured by Gmc of Different Models Price and Age of Cars Manufactured by Dodge of Different Models Price and Age of Cars Manufactured by Bmw of Different Models Price and Age of Cars Manufactured by Hyundai of Different Models Price and Age of Cars Manufactured by Mercedes-benz of Different Models Price and Age of Cars Manufactured by Subaru of Different Models Price and Age of Cars Manufactured by Kia of Different Models Price and Age of Cars Manufactured by Volkswagen of Different Models Price and Age of Cars Manufactured by Lexus of Different Models Price and Age of Cars Manufactured by Chrysler of Different Models Price and Age of Cars Manufactured by Cadillac of Different Models Price and Age of Cars Manufactured by Audi of Different Models Price and Age of Cars Manufactured by Mazda of Different Models
# Sanity check: print the size of each result container on one line.
print(*(len(container) for container in (
    dict_age_depre_rate,       # number of keys
    dict_age_depre_rate,       # number of values
    dict_mileage_depre_rate,
    age_const,
    mileage_const,
    manufacturer_list,
    model_list,
)))
310 310 310 310 310 310 310
# Assemble the per-model regression results into one table, add the implied
# prices at age 0 / mileage 0 (exp of each intercept), and save it to CSV.
df_4 = pd.DataFrame(
    {
        "model and manufacturer": list(dict_age_depre_rate),
        "age_depre_rate": list(dict_age_depre_rate.values()),
        "mileage_depre_rate (*10**6)": list(dict_mileage_depre_rate.values()),
        "age_const": age_const,
        "mileage_const": mileage_const,
        "model": model_list,
        "manufacturer": manufacturer_list,
    }
)
df_4 = df_4.assign(
    age_0_price=np.exp(df_4["age_const"]),
    mileage_0_price=np.exp(df_4["mileage_const"]),
)
df_4.to_csv("car_depre_rate.csv")
df_4
| model and manufacturer | age_depre_rate | mileage_depre_rate (*10**6) | age_const | mileage_const | model | manufacturer | age_0_price | mileage_0_price | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ford f-150 | -0.1308 | -7.364 | 10.9655 | 10.7615 | f-150 | ford | 57843.710160 | 47169.369045 |
| 1 | ford escape | -0.1191 | -8.536 | 10.1533 | 10.1316 | escape | ford | 25675.692341 | 25124.531540 |
| 2 | ford explorer | -0.1531 | -10.454 | 10.7180 | 10.7270 | explorer | ford | 45161.489480 | 45569.777425 |
| 3 | ford fusion | -0.1274 | -8.031 | 10.0453 | 9.9310 | fusion | ford | 23047.210003 | 20557.888194 |
| 4 | ford mustang | -0.0910 | -8.584 | 10.5313 | 10.4475 | mustang | ford | 37470.153935 | 34458.121634 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 305 | mazda mazda5 | -0.0975 | -9.329 | 9.8407 | 9.8351 | mazda5 | mazda | 18782.859421 | 18677.969374 |
| 306 | mazda mazda2 | -0.1571 | -4.706 | 10.0953 | 9.2187 | mazda2 | mazda | 24228.865728 | 10083.946673 |
| 307 | mazda cx-5 grand touring | 0.2177 | 19.387 | 7.8597 | 7.8765 | cx-5 grand touring | mazda | 2590.743036 | 2634.635181 |
| 308 | mazda cx-5 touring | -0.0620 | -2.453 | 9.8803 | 9.6496 | cx-5 touring | mazda | 19541.584258 | 15515.580631 |
| 309 | mazda 2 | 0.0149 | -1.589 | 8.4863 | 8.8040 | 2 | mazda | 4847.895635 | 6660.834127 |
310 rows × 9 columns
# Share of each body type per "manufacturer model" key, as percentage strings.
vehicle["model and manufacturer"] = vehicle["manufacturer"] + " " + vehicle["model"]
# Distinct listing ids per (key, type) for the keys present in df_4.
df_5 = (
    vehicle.loc[vehicle["model and manufacturer"].isin(df_4["model and manufacturer"])]
    .pivot_table(
        index='model and manufacturer',
        columns='type',
        values='id',
        aggfunc=lambda ids: len(ids.unique()),
    )
    .fillna(0)
)
# Row-wise percentages, rounded to two decimals, rendered with a '%' suffix.
df_6 = df_5.div(df_5.sum(axis=1), axis=0).mul(100).round(2).astype(str) + '%'
# Attach to df_4, for every "manufacturer model" key, the body types that make
# up at least 25% of that key's listings (as a list and as a joined string).
vehicle["model and manufacturer"] = vehicle["manufacturer"] + " " + vehicle["model"]
df_5 = vehicle[vehicle["model and manufacturer"].isin(df_4["model and manufacturer"])].\
    pivot_table(index='model and manufacturer', columns='type', values='id',
                aggfunc=lambda x: len(x.unique())).fillna(0)
# Row-wise percentage of each type, rounded to two decimals.
share = (df_5.div(df_5.sum(axis=1), axis=0) * 100).round(2)
df_6 = share.astype(str) + '%'
# Built in one pass from the numeric shares; this replaces chained-indexing
# writes such as df_6["types_str"][j] = ..., which may act on a temporary copy.
df_6["types"] = [[t for t, pct in row.items() if pct >= 25]
                 for _, row in share.iterrows()]
df_6["types_str"] = df_6["types"].apply(", ".join)
# Join on the key itself. The original passed `left_on=df_6.index`, which
# pairs df_4's rows positionally with df_6's alphabetically sorted index and
# therefore attached the types of unrelated models.
# Any 'types' columns left by an earlier run are dropped first so re-running
# the cell does not create suffixed duplicates.
df_4 = df_4.drop(columns=["types", "types_str"], errors="ignore").merge(
    df_6[["types", "types_str"]].reset_index(),
    on="model and manufacturer",
)
df_4
| model and manufacturer | age_depre_rate | mileage_depre_rate (*10**6) | age_const | mileage_const | model | manufacturer | age_0_price | mileage_0_price | types | types_str | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ford f-150 | -0.1308 | -7.364 | 10.9655 | 10.7615 | f-150 | ford | 57843.710160 | 47169.369045 | [sedan] | sedan |
| 1 | ford escape | -0.1191 | -8.536 | 10.1533 | 10.1316 | escape | ford | 25675.692341 | 25124.531540 | [sedan] | sedan |
| 2 | ford explorer | -0.1531 | -10.454 | 10.7180 | 10.7270 | explorer | ford | 45161.489480 | 45569.777425 | [coupe] | coupe |
| 3 | ford fusion | -0.1274 | -8.031 | 10.0453 | 9.9310 | fusion | ford | 23047.210003 | 20557.888194 | [sedan] | sedan |
| 4 | ford mustang | -0.0910 | -8.584 | 10.5313 | 10.4475 | mustang | ford | 37470.153935 | 34458.121634 | [hatchback, sedan] | hatchback, sedan |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 305 | mazda mazda5 | -0.0975 | -9.329 | 9.8407 | 9.8351 | mazda5 | mazda | 18782.859421 | 18677.969374 | [sedan] | sedan |
| 306 | mazda mazda2 | -0.1571 | -4.706 | 10.0953 | 9.2187 | mazda2 | mazda | 24228.865728 | 10083.946673 | [hatchback] | hatchback |
| 307 | mazda cx-5 grand touring | 0.2177 | 19.387 | 7.8597 | 7.8765 | cx-5 grand touring | mazda | 2590.743036 | 2634.635181 | [mini-van, van] | mini-van, van |
| 308 | mazda cx-5 touring | -0.0620 | -2.453 | 9.8803 | 9.6496 | cx-5 touring | mazda | 19541.584258 | 15515.580631 | [SUV] | SUV |
| 309 | mazda 2 | 0.0149 | -1.589 | 8.4863 | 8.8040 | 2 | mazda | 4847.895635 | 6660.834127 | [SUV] | SUV |
310 rows × 11 columns
# Scatter of mileage depreciation rate against age depreciation rate across
# the models in df_4, with a robust regression line in red.
g = sns.regplot(
    data=df_4,
    x="age_depre_rate",
    y="mileage_depre_rate (*10**6)",
    robust=True,
    lowess=False,
    ci=None,
    scatter_kws=dict(alpha=0.3),
    line_kws=dict(color="red"),
)
# Fixed axis windows for both rates.
g.set_xlim([-0.2, 0])
g.set_ylim([-20, 0])
g.set_title("Age and Mileage Depreciation Rate for 310 Models of Cars", fontsize=15)
g.set_xlabel("Age Depreciation Rate", fontsize=10)
g.set_ylabel("Mileage Depreciation Rate", fontsize=10)
Text(0, 0.5, 'Mileage Depreciation Rate')
import numpy as np
import pylab
import scipy.stats as stats

# Normal Q-Q plots of four df_4 columns, each shown as its own figure.
# The result of the first call is kept in g2, as before.
g2 = stats.probplot(df_4["mileage_depre_rate (*10**6)"], dist="norm", plot=pylab)
pylab.show()
for column in ("age_depre_rate", "age_const", "mileage_const"):
    stats.probplot(df_4[column], dist="norm", plot=pylab)
    pylab.show()
# Series.sort_values() has no `by` parameter (that belongs to DataFrame);
# passing a list positionally binds it to `axis` and raises TypeError.
df_4['age_0_price'].sort_values()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Input In [60], in <cell line: 1>() ----> 1 df_4['age_0_price'].sort_values(['age_0_price']) File ~/.local/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) File ~/.local/lib/python3.10/site-packages/pandas/core/series.py:3744, in Series.sort_values(self, axis, ascending, inplace, kind, na_position, ignore_index, key) 3742 inplace = validate_bool_kwarg(inplace, "inplace") 3743 # Validate the axis parameter -> 3744 self._get_axis_number(axis) 3746 # GH 5856/5853 3747 if inplace and self._is_cached: File ~/.local/lib/python3.10/site-packages/pandas/core/generic.py:554, in NDFrame._get_axis_number(cls, axis) 550 @final 551 @classmethod 552 def _get_axis_number(cls, axis: Axis) -> int: 553 try: --> 554 return cls._AXIS_TO_AXIS_NUMBER[axis] 555 except KeyError: 556 raise ValueError(f"No axis named {axis} for object type {cls.__name__}") TypeError: unhashable type: 'list'
import numpy as np
import pylab
import scipy.stats as stats

# Normal Q-Q plot of age_0_price, leaving out every row equal to the maximum.
age_0_price = df_4["age_0_price"]
stats.probplot(age_0_price[age_0_price != age_0_price.max()], dist="norm", plot=pylab)
pylab.show()
# Distinct values of the raw "type" column (missing values included).
set(vehicle["type"].tolist())
{'SUV',
'bus',
'convertible',
'coupe',
'hatchback',
'mini-van',
nan,
'offroad',
'other',
'pickup',
'sedan',
'truck',
'van',
'wagon'}
from collections import Counter

# Frequency of every value in the raw "type" column, most common first.
Counter(vehicle["type"].tolist()).most_common()
[(nan, 92858),
('sedan', 87056),
('SUV', 77284),
('pickup', 43510),
('truck', 35279),
('other', 22110),
('coupe', 19204),
('hatchback', 16598),
('wagon', 10751),
('van', 8548),
('convertible', 7731),
('mini-van', 4825),
('offroad', 609),
('bus', 517)]
import re
from collections import Counter

# Grid of age-depreciation-rate histograms: one panel for each of the n*n
# most frequent non-missing vehicle types in the raw listings.
n = 3
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Age and Mileage Depreciation Rate for 310 Models", fontsize=30)
type_list = [x[0] for x in Counter(list(vehicle["type"])).most_common()]
type_list = [x for x in type_list if str(x) != 'nan'][:n**2]
for i, car_type in enumerate(type_list):
    # Match car_type only as a whole entry of types_str. A plain substring
    # search would let "van" also select models typed only as "mini-van".
    type_pattern = r"(?<![\w-])" + re.escape(car_type) + r"(?![\w-])"
    # na=False keeps the mask boolean if types_str has missing entries.
    new_df = df_4[df_4["types_str"].str.contains(type_pattern, case=False, na=False)]
    g = sns.histplot(x="age_depre_rate",
                     data=new_df, ax=axs[i//n, i%n], kde=True)
    g.set_xlim([-0.2, 0])
    g.set_title("Age Depreciation Rate\n for "+car_type.capitalize(), fontsize=10)
    g.set_xlabel("Age Depreciation Rate", fontsize=10)
import re
from collections import Counter

# Grid of robust regressions of mileage- on age-depreciation rate: one panel
# for each of the n*n most frequent non-missing vehicle types.
n = 3
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Age and Mileage Depreciation Rate for 310 Models", fontsize=30)
type_list = [x[0] for x in Counter(list(vehicle["type"])).most_common()]
type_list = [x for x in type_list if str(x) != 'nan'][:n**2]
for i, car_type in enumerate(type_list):
    # Match car_type only as a whole entry of types_str. A plain substring
    # search would let "van" also select models typed only as "mini-van".
    type_pattern = r"(?<![\w-])" + re.escape(car_type) + r"(?![\w-])"
    # na=False keeps the mask boolean if types_str has missing entries.
    new_df = df_4[df_4["types_str"].str.contains(type_pattern, case=False, na=False)]
    g = sns.regplot(x="age_depre_rate", y="mileage_depre_rate (*10**6)",
                    data=new_df, scatter_kws={'alpha': 0.3}, lowess=False,
                    robust=True, ci=None,
                    line_kws={"color": "red"}, ax=axs[i//n, i%n])
    g.set_xlim([-0.2, 0])
    g.set_ylim([-20, 0])
    g.set_title("Age and Mileage Depreciation Rate\n for "+car_type.capitalize(), fontsize=10)
    g.set_xlabel("Age Depreciation Rate", fontsize=10)
    g.set_ylabel("Mileage Depreciation Rate", fontsize=10)
# LOWESS trend of the mileage-zero price against the age-zero price, with a
# green y = x reference line.
g = sns.regplot(x="age_0_price", y="mileage_0_price",
                data=df_4, scatter_kws={'alpha': 0.3}, lowess=True,
                robust=False,
                line_kws={"color": "red"})
# regplot returns a matplotlib Axes, which has no FacetGrid-style
# map_dataframe(); draw the reference line directly on the Axes instead.
g.plot([0, 100000], [0, 100000], color='g')
g.set_xlim([0, 100000])
g.set_ylim([0, 100000])
g.set_title(" Prices for 310 Models of Cars When Age and Mileage are 0", fontsize=15)
g.set_xlabel("Price When Age is 0", fontsize=10)
g.set_ylabel("Price When Mileage is 0", fontsize=10)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Input In [63], in <cell line: 5>() 1 g = sns.regplot(x="age_0_price",y="mileage_0_price",\ 2 data=df_4,scatter_kws={'alpha':0.3},lowess=True,\ 3 robust=False,\ 4 line_kws={"color": "red"}) ----> 5 g.map_dataframe(sns.lineplot, 'x', 'y_line', color='g') 6 g.set_xlim([0,100000]) 7 g.set_ylim([0,100000]) AttributeError: 'AxesSubplot' object has no attribute 'map_dataframe'
# LOWESS trend of the mileage-zero price against the age-zero price on an
# explicit Axes, overlaid with a green line where both prices are equal.
fig, ax = plt.subplots(figsize=(6, 4))
g = sns.regplot(
    data=df_4,
    x="age_0_price",
    y="mileage_0_price",
    lowess=True,
    robust=False,
    scatter_kws={'alpha': 0.3},
    line_kws={"color": "red"},
    ax=ax,
)
g.set(xlim=(0, 100000), ylim=(0, 100000))
g.set_title(" Prices for 310 Models of Cars When Age and Mileage are 0", fontsize=15)
g.set_xlabel("Price When Age is 0", fontsize=10)
g.set_ylabel("Price When Mileage is 0", fontsize=10)
# Same coordinates on both axes, i.e. points on the diagonal.
diagonal = np.linspace(100000, 10)
p2 = sns.lineplot(x=diagonal, y=diagonal, color='g', ax=ax)
# Mileage-zero price against age-zero price; red curve is a LOWESS smoother.
g = sns.regplot(
    data=df_4,
    x="age_0_price",
    y="mileage_0_price",
    lowess=True,
    robust=False,
    scatter_kws={'alpha': 0.3},
    line_kws={"color": "red"},
)
g.set(xlim=(0, 100000), ylim=(0, 100000))
g.set_title(" Prices for 310 Models of Cars When Age and Mileage are 0", fontsize=15)
g.set_xlabel("Price When Age is 0", fontsize=10)
g.set_ylabel("Price When Mileage is 0", fontsize=10)
Text(0, 0.5, 'Price When Mileage is 0')
# Mileage-zero price against age-zero price; red line is seaborn's default
# linear fit (neither LOWESS nor robust regression is requested).
g = sns.regplot(
    data=df_4,
    x="age_0_price",
    y="mileage_0_price",
    lowess=False,
    robust=False,
    scatter_kws={'alpha': 0.3},
    line_kws={"color": "red"},
)
g.set(xlim=(0, 100000), ylim=(0, 100000))
g.set_title(" Prices for 310 Models of Cars When Age and Mileage are 0", fontsize=15)
g.set_xlabel("Price When Age is 0", fontsize=10)
g.set_ylabel("Price When Mileage is 0", fontsize=10)
Text(0, 0.5, 'Price When Mileage is 0')
# Mileage-zero price against age-zero price; red line is a robust regression.
g = sns.regplot(
    data=df_4,
    x="age_0_price",
    y="mileage_0_price",
    lowess=False,
    robust=True,
    scatter_kws={'alpha': 0.3},
    line_kws={"color": "red"},
)
g.set(xlim=(0, 100000), ylim=(0, 100000))
g.set_title(" Prices for 310 Models of Cars When Age and Mileage are 0", fontsize=15)
g.set_xlabel("Price When Age is 0", fontsize=10)
g.set_ylabel("Price When Mileage is 0", fontsize=10)
Text(0, 0.5, 'Price When Mileage is 0')
# Logged price against mileage for the n*n most-listed models of one
# manufacturer, each panel annotated with a Huber robust-regression equation.
model="dodge"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Logged Price and Mileage of Cars Manufactured by "+\
             model.capitalize()+" of Different Models", fontsize=30)
# .copy() so the column assignments below act on an independent frame rather
# than on a slice of vehicle_1 (avoids SettingWithCopyWarning).
df = vehicle_1[vehicle_1["manufacturer"]==model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna()).most_common(n**2)]
for i, car_model in enumerate(car_list):
    # NOTE(review): rows are selected by model name across all of vehicle_1,
    # not only within `df` -- confirm model names are unique per manufacturer.
    df_1 = vehicle_1[vehicle_1["model"]==car_model].copy()
    df_1["log_price"] = np.log(df_1["price"])
    # Keep odometer readings in [8000, 300000] that are also at or below the
    # model's 85th percentile.
    df_2 = df_1[(df_1["odometer"]>=8000) & (df_1["odometer"]<=300000)
                & (df_1["odometer"]<=np.quantile(list(df_1["odometer"]), .85))]
    # Best effort: a failed fit must not leave the previous panel's model in
    # place, otherwise its equation would be printed on this panel.
    try:
        model1 = sm.RLM(df_2["log_price"], sm.add_constant(df_2["odometer"]),
                        M=sm.robust.norms.HuberT(t=2)).fit()
    except Exception as exc:
        model1 = None
        print("RLM fit failed for "+str(car_model)+": "+str(exc))
    g1 = sns.regplot(x="odometer", y="log_price", data=df_2, ax=axs[i//n, i%n],
                     scatter_kws={'alpha': 0.1}, lowess=False,
                     robust=True, ci=None,
                     line_kws={"color": "red"})
    g1.set_title(car_model, fontsize=15)
    g1.set_xlim([-5000, 280000])
    g1.set_ylim([7.5, 12])
    g1.set_xlabel("Mileage", fontsize=10)
    g1.set_ylabel("Logged Price", fontsize=10)
    if model1 is not None:
        g1.text(50000, 11.5, "y="+str(round(model1.params["odometer"], 9))+"x+"+\
                str(round(model1.params["const"], 2)), fontsize=15)
# Logged price against mileage for the n*n most-listed models of one
# manufacturer, each panel annotated with an OLS fit of log_price ~ odometer.
model="bmw"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Logged Price and Mileage of Cars Manufactured by "+model.capitalize()+" of Different Models", fontsize=30)
# .copy() so the column assignments below act on an independent frame.
df = vehicle_1[vehicle_1["manufacturer"]==model].copy()
# The cast used to be split across two lines ("df[...] = df[...]" followed by
# a line starting with ".astype(int)"), which is a SyntaxError.
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [x[0] for x in Counter(list(df[df["model"].notna()]\
                                  ["model"])).most_common()][:n**2]
for i in range(len(car_list)):
    df_1 = vehicle_1[vehicle_1["model"]==car_list[i]].copy()
    df_1["log_price"] = np.log(df_1["price"])
    df_2 = df_1[(df_1["odometer"]>=1000) & (df_1["odometer"]<=200000)]
    # NOTE(review): the OLS statistics below come from the mileage-filtered
    # df_2, while the scatter and red line use the unfiltered df_1.
    model1 = smf.ols('log_price ~ odometer', data=df_2).fit()
    g1 = sns.regplot(x="odometer", y=np.log(df_1["price"]), data=df_1, ax=axs[i//n, i%n],
                     scatter_kws={'alpha': 0.1}, lowess=False,
                     line_kws={"color": "red"})
    g1.set_title(car_list[i], fontsize=15)
    g1.set_xlim([-5000, 280000])
    g1.set_ylim([7.5, 12])
    g1.set_xlabel("Mileage", fontsize=10)
    g1.set_ylabel("Logged Price", fontsize=10)
    g1.text(80000, 11.5, "y="+str(round(model1.params["odometer"], 8))+"x+"+str(round(model1.params["Intercept"], 2)), fontsize=15)
    g1.text(80000, 11, "mileage p-value: "+str(round(model1.pvalues["odometer"], 3)), fontsize=15)
    g1.text(80000, 10.5, "F p-value: "+str(round(model1.f_pvalue, 3)), fontsize=15)
    g1.text(80000, 10, "R2: "+str(round(model1.rsquared, 3)), fontsize=15)
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# Text-only panels: for each of the n*n most-listed models of the manufacturer,
# fit log_price ~ age + odometer by OLS and print the equation and statistics.
model = "nissan"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle(
    "Logged Price, age and Mileage of Cars Manufactured by "
    + model.capitalize() + " of Different Models",
    fontsize=30,
)
df = vehicle_1[vehicle_1["manufacturer"] == model]
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna().tolist()).most_common(n**2)]
for i, car_model in enumerate(car_list):
    df_1 = vehicle_1[vehicle_1["model"] == car_model]
    df_1["log_price"] = np.log(df_1["price"])
    # Inclusive bounds on mileage and age.
    in_range = df_1["odometer"].between(10, 300000) & df_1["age"].between(1, 20)
    df_2 = df_1[in_range]
    model1 = smf.ols('log_price ~ age + odometer', data=df_2).fit()
    # A single dummy point: the panel serves as a canvas for the text below.
    g1 = sns.regplot(x=[0], y=[0], data=df_1, ax=axs.flat[i], lowess=True,
                     scatter_kws={'alpha': 0.1}, line_kws={"color": "red"})
    g1.set_title(car_model, fontsize=15)
    g1.set_xlim([0, 10])
    g1.set_ylim([0, 10])
    g1.set_xlabel("", fontsize=10)
    g1.set_ylabel("", fontsize=10)
    coef = model1.params
    pval = model1.pvalues
    equation = ("ln(price)=" + str(round(coef["odometer"], 8)) + "*mileage\n"
                + str(round(coef["age"], 3)) + "*age+"
                + str(round(coef["Intercept"], 2)))
    panel_lines = [
        (8, model.capitalize() + " " + car_model.capitalize()),
        (6, equation),
        (4, "mileage p-value: " + str(round(pval["odometer"], 3))),
        (3, "age p-value: " + str(round(pval["age"], 3))),
        (2, "F p-value: " + str(round(model1.f_pvalue, 3))),
        (1, "R2: " + str(round(model1.rsquared, 3))),
    ]
    for y_pos, line in panel_lines:
        g1.text(1, y_pos, line, fontsize=15)
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import sklearn.decomposition as skd
import sklearn.preprocessing as skp
# Text-only panels: for each of the n*n most-listed models of the manufacturer,
# standardise price/age/odometer, run a PCA on (age, odometer), regress
# log_price on the first component and print the results.
model = "nissan"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle(
    "Logged Price, age and Mileage of Cars Manufactured by "
    + model.capitalize() + " of Different Models\n Using PCA regression",
    fontsize=30,
)
df = vehicle_1[vehicle_1["manufacturer"] == model]
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna().tolist()).most_common(n**2)]
numeric_cols = ["price", "age", "odometer"]
features = ["age", "odometer"]
for i, car_model in enumerate(car_list):
    df_1 = vehicle_1[vehicle_1["model"] == car_model]
    # log_price is taken from the raw price, before any scaling below.
    df_1["log_price"] = np.log(df_1["price"])
    in_range = df_1["odometer"].between(10, 300000) & df_1["age"].between(1, 20)
    df_2 = df_1[in_range]
    df_2[numeric_cols] = df_2[numeric_cols].astype(float)
    df_2[numeric_cols] = skp.scale(df_2[numeric_cols])
    df_2 = df_2[["model", "price", "log_price", "age", "odometer"]]
    pca_model = skd.PCA().fit(df_2[features])
    df_2[["PC1", "PC2"]] = pca_model.transform(df_2[features])
    model1 = smf.ols('log_price ~ PC1', data=df_2).fit()
    # A single dummy point: the panel serves as a canvas for the text below.
    g1 = sns.regplot(x=[0], y=[0], data=df_1, ax=axs.flat[i], lowess=True,
                     scatter_kws={'alpha': 0.1}, line_kws={"color": "red"})
    g1.set_title(car_model, fontsize=15)
    g1.set_xlim([0, 10])
    g1.set_ylim([0, 10])
    g1.set_xlabel("", fontsize=10)
    g1.set_ylabel("", fontsize=10)
    first_pc = pca_model.components_[0]
    variances = pca_model.explained_variance_
    panel_lines = [
        (9, model.capitalize() + " " + car_model.capitalize()),
        (8, "ln(price)="
            + str(round(model1.params["PC1"], 3)) + "*PC1+"
            + str(round(model1.params["Intercept"], 2))),
        (7, "PC1: (" + str(round(first_pc[0], 3))
            + ", " + str(round(first_pc[1], 3)) + ")"),
        (6, "PC1 p-value: " + str(round(model1.pvalues["PC1"], 3))),
        (5, "F p-value: " + str(round(model1.f_pvalue, 3))),
        (4, "R2: " + str(round(model1.rsquared, 3))),
        (2, "PC1,PC2 variance: \n" + str(round(variances[0], 3))
            + ", " + str(round(variances[1], 3))),
    ]
    for y_pos, line in panel_lines:
        g1.text(1, y_pos, line, fontsize=15)
# Logged price against mileage (LOWESS trend) for the n*n most-listed models of
# one manufacturer; each panel is also annotated with an OLS fit of
# odometer ~ age.
model="ford"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Logged Price and Mileage of Cars Manufactured by "+model.capitalize()+" of Different Models", fontsize=30)
# .copy() so the column assignments below act on an independent frame.
df = vehicle_1[vehicle_1["manufacturer"]==model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [x[0] for x in Counter(list(df[df["model"].notna()]\
                                  ["model"])).most_common()][:n**2]
for i in range(len(car_list)):
    df_1 = vehicle_1[vehicle_1["model"]==car_list[i]].copy()
    df_1["log_price"] = np.log(df_1["price"])
    df_2 = df_1[(df_1["odometer"]>=1000) & (df_1["odometer"]<=200000)]
    model1 = smf.ols('odometer ~ age', data=df_2).fit()
    g1 = sns.regplot(x="odometer", y=np.log(df_1["price"]), data=df_1, ax=axs[i//n, i%n],
                     scatter_kws={'alpha': 0.1}, lowess=True,
                     line_kws={"color": "red"})
    g1.set_title(car_list[i], fontsize=15)
    g1.set_xlim([-5000, 280000])
    g1.set_ylim([7.5, 12])
    # Labels describe what is plotted: x is odometer, y is log(price).
    g1.set_xlabel("Mileage", fontsize=10)
    g1.set_ylabel("Logged Price", fontsize=10)
    # The model is odometer ~ age, so its terms are "age" and "Intercept";
    # looking up "odometer" in params/pvalues raised KeyError.
    g1.text(80000, 11.5, "odometer="+str(round(model1.params["age"], 3))+"*age+"+str(round(model1.params["Intercept"], 2)), fontsize=15)
    g1.text(80000, 11, "Age p-value: "+str(round(model1.pvalues["age"], 3)), fontsize=15)
# Mileage against age (LOWESS trend) for the n*n most-listed models of one
# manufacturer, annotated with an OLS fit of odometer ~ age on filtered rows.
model = "chevrolet"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Age and Mileage of Cars Manufactured by " + model.capitalize() + " of Different Models", fontsize=20)
print("Price and Logged Age of Cars Manufactured by " + model.capitalize() + " of Different Models")
df = vehicle_1[vehicle_1["manufacturer"] == model]
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna().tolist()).most_common(n**2)]
for i, car_model in enumerate(car_list):
    df_1 = vehicle_1[vehicle_1["model"] == car_model]
    # The fit uses rows within inclusive mileage/age bounds; the scatter below
    # still shows every row of df_1.
    in_range = df_1["odometer"].between(10, 300000) & df_1["age"].between(1, 15)
    df_2 = df_1[in_range]
    model1 = smf.ols('odometer ~ age', data=df_2).fit()
    g1 = sns.regplot(data=df_1, x="age", y="odometer", lowess=True, ax=axs.flat[i],
                     scatter_kws={'alpha': 0.1}, line_kws={"color": "red"})
    g1.set_title(car_model, fontsize=15)
    g1.set(xlim=(-2, 40), ylim=(-2000, 340000), xlabel="Age", ylabel="Mileage")
    panel_lines = [
        (280000, "odometer=" + str(round(model1.params["age"], 3)) + "*age\n+"
                 + str(round(model1.params["Intercept"], 2))),
        (260000, "age p-value: " + str(round(model1.pvalues["age"], 3))),
        (240000, "F p-value: " + str(round(model1.f_pvalue, 3))),
        (220000, "R2: " + str(round(model1.rsquared, 3))),
    ]
    for y_pos, line in panel_lines:
        g1.text(5, y_pos, line, fontsize=15)
Price and Logged Age of Cars Manufactured by Chevrolet of Different Models
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import sklearn.decomposition as skd
import sklearn.preprocessing as skp
# Standardised mileage against standardised age for the n*n most-listed models
# of one manufacturer, with the principal-component directions drawn as arrows
# and an OLS fit of odometer ~ age annotated on each panel.
model="ford"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Age and Mileage of Cars Manufactured by "+model.capitalize()+" of Different Models", fontsize=20)
print("Price and Logged Age of Cars Manufactured by "+model.capitalize()+" of Different Models")
# .copy() so the column assignments below act on an independent frame.
df = vehicle_1[vehicle_1["manufacturer"]==model].copy()
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [x[0] for x in Counter(list(df[df["model"].notna()]\
                                  ["model"])).most_common()][:n**2]
for i in range(len(car_list)):
    df_1 = vehicle_1[vehicle_1["model"]==car_list[i]]
    df_2 = df_1[(df_1["odometer"]>=10) & (df_1["odometer"]<=300000)
                & (df_1["age"]>=1) & (df_1["age"]<=15)].copy()
    df_2[["price","age","odometer"]] = df_2[["price","age","odometer"]].astype(float)
    # Take the log BEFORE standardising: a standardised price is negative for
    # every below-average listing, and the log of those values is NaN.
    df_2["log_price"] = np.log(df_2["price"])
    df_2[["price","age","odometer"]] = skp.scale(df_2[["price","age","odometer"]])
    df_2 = df_2[["model","price","log_price","age","odometer"]]
    pca_model = skd.PCA().fit(df_2[["age","odometer"]])
    df_2[["PC1","PC2"]] = pca_model.transform(df_2[["age","odometer"]])
    model1 = smf.ols('odometer ~ age', data=df_2).fit()
    g1 = sns.regplot(x="age", y="odometer", data=df_2, ax=axs[i//n, i%n],
                     scatter_kws={'alpha': 0.1}, lowess=True,
                     line_kws={"color": "red"})
    g1.set_title(car_list[i], fontsize=15)
    g1.set_xlim([-2, 4])
    g1.set_ylim([-2, 4])
    g1.set_xlabel("Normalized Age")
    g1.set_ylabel("Normalized Mileage")
    # The axes are (age, odometer), so each component's direction is its row
    # of components_. transform(identity) would instead give each variable's
    # coordinates in PC space, which mislabels the arrows on these axes.
    # A separate loop variable is used so the panel index `i` is not clobbered.
    for k, direction in enumerate(pca_model.components_):
        g1.annotate("PC"+str(k+1),
                    xy=(0, 0), xytext=direction[:2]*3,
                    fontsize=15, color='orange',
                    arrowprops=dict(
                        arrowstyle='<-', linewidth=2, color='orange'))
    g1.text(-2, 3.5, "odometer="+str(round(model1.params["age"], 3))+"*age+"\
            +str(round(model1.params["Intercept"], 2)), fontsize=15)
    g1.text(-2, 3, "age p-value: "+str(round(model1.pvalues["age"], 3)), fontsize=15)
    g1.text(2, 3, "F p-value: "+str(round(model1.f_pvalue, 3)), fontsize=15)
    g1.text(-2, 2.5, "R2: "+str(round(model1.rsquared, 3)), fontsize=15)
Price and Logged Age of Cars Manufactured by Ford of Different Models
# log(mileage) against log(age) with a LOWESS trend, one panel for each of the
# n*n most-listed models of the manufacturer.
model = "nissan"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Logged Age and Logged Mileage of Cars Manufactured by " + model.capitalize() + " of Different Models", fontsize=20)
print("Price and Logged Age of Cars Manufactured by " + model.capitalize() + " of Different Models")
df = vehicle_1[vehicle_1["manufacturer"] == model]
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna().tolist()).most_common(n**2)]
for i, car_model in enumerate(car_list):
    # Rows are selected from vehicle_1 by model name, not from df.
    df_1 = vehicle_1[vehicle_1["model"] == car_model]
    g1 = sns.regplot(data=df_1, x=np.log(df_1["age"]), y=np.log(df_1["odometer"]),
                     lowess=True, ax=axs.flat[i],
                     scatter_kws={'alpha': 0.1}, line_kws={"color": "red"})
    g1.set_title(car_model, fontsize=15)
    g1.set(xlim=(-0.2, 4), ylim=(9, 13), xlabel="Logged Age", ylabel="Logged Mileage")
Price and Logged Age of Cars Manufactured by Nissan of Different Models
/tmp/ipykernel_1568744/1396034511.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1568744/1396034511.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1568744/1396034511.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
# The 20 most frequent non-missing manufacturers with their listing counts.
Counter(vehicle_1["manufacturer"].dropna().tolist()).most_common(20)
[('ford', 25651),
('chevrolet', 18388),
('toyota', 11116),
('honda', 7005),
('nissan', 6750),
('ram', 6604),
('jeep', 6560),
('gmc', 5477),
('dodge', 4679),
('bmw', 4401),
('hyundai', 3558),
('mercedes-benz', 3478),
('subaru', 3394),
('kia', 2952),
('volkswagen', 2559),
('lexus', 2320),
('chrysler', 2134),
('cadillac', 2086),
('audi', 2019),
('mazda', 1602)]
# Price against mileage with a LOWESS trend, one panel for each of the n*n
# most-listed models of the manufacturer.
model = "bmw"
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Price and Mileage of Cars Manufactured by " + model.capitalize() + " of Different Models", fontsize=20)
print("Price and Logged Age of Cars Manufactured by " + model.capitalize() + " of Different Models")
df = vehicle_1[vehicle_1["manufacturer"] == model]
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna().tolist()).most_common(n**2)]
for i, car_model in enumerate(car_list):
    # Rows are selected from vehicle_1 by model name, not from df.
    df_1 = vehicle_1[vehicle_1["model"] == car_model]
    g1 = sns.regplot(data=df_1, x="odometer", y="price", lowess=True, ax=axs.flat[i],
                     scatter_kws={'alpha': 0.1}, line_kws={"color": "red"})
    g1.set_title(car_model, fontsize=15)
    g1.set(xlim=(-2000, 320000), ylim=(0, 80000), xlabel="Mileage", ylabel="Price")
Price and Logged Age of Cars Manufactured by Bmw of Different Models
/tmp/ipykernel_2639724/3918321930.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_2639724/3918321930.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_2639724/3918321930.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
# log(price) against mileage with a LOWESS trend, one panel per model.
# `model` is not assigned here; it must already be defined when this runs.
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Logged Price and Mileage of Cars Manufactured by " + model.capitalize() + " of Different Models", fontsize=20)
df = vehicle_1[vehicle_1["manufacturer"] == model]
df["year"] = df["year"].astype(int)
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
car_list = [name for name, _ in Counter(df["model"].dropna().tolist()).most_common(n**2)]
for i, car_model in enumerate(car_list):
    # Rows are selected from vehicle_1 by model name, not from df.
    df_1 = vehicle_1[vehicle_1["model"] == car_model]
    g1 = sns.regplot(data=df_1, x="odometer", y=np.log(df_1["price"]), lowess=True,
                     ax=axs.flat[i],
                     scatter_kws={'alpha': 0.1}, line_kws={"color": "red"})
    g1.set_title(car_model, fontsize=15)
    g1.set(xlim=(-2000, 320000), ylim=(7.5, 12.5), xlabel="Mileage", ylabel="Logged Price")
/tmp/ipykernel_2639724/3083148546.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_2639724/3083148546.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_2639724/3083148546.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
# Price against age, one scatter panel per entry of car_list (at most n*n).
# car_list is not assigned here; each entry is compared with the
# "manufacturer" column of vehicle_1.
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Price and Age of Cars Manufactured by Ford of Different Models", fontsize=20)
for i, maker in enumerate(car_list[:n**2]):
    df_1 = vehicle_1[vehicle_1["manufacturer"] == maker]
    g1 = sns.scatterplot(data=df_1, x="age", y="price", alpha=0.01, ax=axs.flat[i])
    g1.set_title(maker, fontsize=15)
    g1.set(xlim=(0, 40), ylim=(0, 80000), xlabel="Age", ylabel="Price")
# Price against mileage, one scatter panel per entry of car_list (at most
# n*n). car_list is not assigned here; each entry is compared with the
# "manufacturer" column of vehicle_1.
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Price and Mileage of Cars of Different Manufacturers", fontsize=20)
for i, maker in enumerate(car_list[:n**2]):
    df_1 = vehicle_1[vehicle_1["manufacturer"] == maker]
    g1 = sns.scatterplot(data=df_1, x="odometer", y="price", alpha=0.01, ax=axs.flat[i])
    g1.set_title(maker, fontsize=15)
    g1.set(xlim=(0, 320000), ylim=(0, 80000), xlabel="Mileage", ylabel="Price")
# Mileage against age, one scatter panel per entry of car_list (at most n*n).
# car_list is not assigned here; each entry is compared with the
# "manufacturer" column of vehicle_1.
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Age and Mileage of Cars of Different Manufacturers", fontsize=20)
for i, maker in enumerate(car_list[:n**2]):
    df_1 = vehicle_1[vehicle_1["manufacturer"] == maker]
    g1 = sns.scatterplot(data=df_1, x="age", y="odometer", alpha=0.01, ax=axs.flat[i])
    g1.set_title(maker, fontsize=15)
    g1.set(xlim=(0, 40), ylim=(0, 320000), xlabel="Age", ylabel="Mileage")
# Price against log(mileage), one scatter panel per entry of car_list (at most
# n*n). car_list is not assigned here; each entry is compared with the
# "manufacturer" column of vehicle_1.
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Price and Logged Mileage of Cars of Different Manufacturers", fontsize=20)
for i, maker in enumerate(car_list[:n**2]):
    df_1 = vehicle_1[vehicle_1["manufacturer"] == maker]
    # The log is applied to the raw odometer column without filtering.
    log_mileage = np.log(df_1["odometer"])
    g1 = sns.scatterplot(data=df_1, x=log_mileage, y="price", alpha=0.01, ax=axs.flat[i])
    g1.set_title(maker, fontsize=15)
    g1.set(xlim=(9, 13), ylim=(0, 80000), xlabel="Logged Mileage", ylabel="Price")
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, 
**kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
# Grid of price-vs-log(age) scatter plots, one panel per manufacturer.
n = 4
fig, axs = plt.subplots(n, n, figsize=(20, 20))
fig.suptitle("Price and Logged Age of Cars of Different Manufacturers", fontsize=20)
# The grid has n*n panels, so at most that many manufacturers are drawn.
for i in range(min(len(car_list), n**2)):
    df_1 = vehicle_1[vehicle_1["manufacturer"] == car_list[i]]
    # log() is undefined for ages <= 0 and emits RuntimeWarnings; mask those
    # ages to NaN first. Such points could never be drawn anyway.
    age = df_1["age"]
    g1 = sns.scatterplot(
        x=np.log(age.where(age > 0)),
        y="price",
        data=df_1,
        ax=axs[i // n, i % n],  # fill the grid row by row
        alpha=0.01,
    )
    g1.set_title(car_list[i], fontsize=15)
    g1.set_xlim([-0.5, 4])
    g1.set_ylim([0, 80000])
    g1.set_xlabel("Logged Age")
    g1.set_ylabel("Price")
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) 
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
# Heatmap of the pairwise correlations between price, age and odometer.
matrix_8 = vehicle_1[["price", "age", "odometer"]].corr()
# Pin the colour scale to the full [-1, 1] correlation range.
g8 = sns.heatmap(
    matrix_8,
    vmin=-1,
    vmax=1,
    annot=True,
)
g8.set_title('Correlation Matrix of Cars', fontsize=15)
Text(0.5, 1.0, 'Correlation Matrix of Cars')
# Violin plot: distribution of odometer readings within each condition value.
sns.set(rc={"figure.figsize": (15, 10)})
ax = sns.violinplot(x="condition", y="odometer", data=df)
ax.set_title("Condition types and Mileage of Ford f-150", fontsize=20)
ax.set_ylim(-50000, 400000)
ax.set_ylabel("Mileage", fontsize=15)
# Kept as the final expression so the cell still echoes the x-label Text object.
ax.set_xlabel("Conditions", fontsize=15)
Text(0.5, 0, 'Conditions')
# Violin plot: distribution of vehicle age within each condition value.
sns.set(rc={"figure.figsize": (15, 10)})
ax = sns.violinplot(x="condition", y="age", data=df)
ax.set_title("Condition types and Age of Ford f-150", fontsize=20)
ax.set_ylim(-5, 60)
ax.set_ylabel("Age", fontsize=15)
# Kept as the final expression so the cell still echoes the x-label Text object.
ax.set_xlabel("Conditions", fontsize=15)
Text(0.5, 0, 'Conditions')
# Violin plot: distribution of price within each condition value.
sns.set(rc={"figure.figsize": (15, 10)})
ax = sns.violinplot(x="condition", y="price", data=df)
ax.set_title("Condition types and Price of Ford f-150", fontsize=20)
ax.set_ylim(-10000, 80000)
ax.set_ylabel("Price", fontsize=15)
# Kept as the final expression so the cell still echoes the x-label Text object.
ax.set_xlabel("Conditions", fontsize=15)
Text(0.5, 0, 'Conditions')
from collections import Counter
# Tally how many rows of df carry each value of the "condition" column.
Counter(df["condition"])
Counter({'good': 949,
'excellent': 1773,
'like new': 237,
'fair': 134,
'salvage': 11,
'new': 8})
# Tally how many rows of df carry each value of the "fuel" column.
Counter(df["fuel"])
Counter({'gas': 3097, 'diesel': 3, 'other': 11, 'hybrid': 1})
# Tally how many rows of df carry each value of the "cylinders" column
# (missing values are counted under their own nan key).
Counter(df["cylinders"])
Counter({'8 cylinders': 1372,
'6 cylinders': 926,
nan: 791,
'3 cylinders': 1,
'10 cylinders': 11,
'4 cylinders': 8,
'other': 3})
# Tally how many rows of df carry each value of the "type" column
# (missing values are counted under their own nan key).
Counter(df["type"])
Counter({'truck': 1527,
'pickup': 1169,
nan: 378,
'van': 7,
'other': 16,
'sedan': 6,
'offroad': 3,
'hatchback': 1,
'coupe': 1,
'SUV': 4})
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (with KDE overlay) of vehicle age.
# The size is passed to subplots() itself: a separate plt.figure(figsize=...)
# call only opens an extra, empty figure and leaves this plot at default size.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(df["age"], kde=True, ax=ax)
<AxesSubplot: xlabel='age', ylabel='Count'>
<Figure size 1080x360 with 0 Axes>
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (with KDE overlay) of vehicle age, zoomed to ages -5..50.
# The size is passed to subplots() itself: a separate plt.figure(figsize=...)
# call only opens an extra, empty figure and leaves this plot at default size.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(df["age"], kde=True, ax=ax)
ax.set_xlim(-5, 50)
(-5.0, 50.0)
<Figure size 1080x360 with 0 Axes>
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (with KDE overlay) of odometer readings, zoomed to -5000..500000.
# The size is passed to subplots() itself: a separate plt.figure(figsize=...)
# call only opens an extra, empty figure and leaves this plot at default size.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(df["odometer"], kde=True, ax=ax)
ax.set_xlim(-5000, 500000)
(-5000.0, 500000.0)
<Figure size 1440x360 with 0 Axes>
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (with KDE overlay) of odometer readings in 2000-wide bins,
# zoomed to -5000..300000.
# The size is passed to subplots() itself: a separate plt.figure(figsize=...)
# call only opens an extra, empty figure and leaves this plot at default size.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(df["odometer"], kde=True, ax=ax, binwidth=2000)
ax.set_xlim(-5000, 300000)
(-5000.0, 300000.0)
<Figure size 1440x360 with 0 Axes>
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (with KDE overlay) of the natural log of the odometer reading.
odometer = df["odometer"]
# Readings of 0 have no logarithm (np.log gives -inf and a divide-by-zero
# RuntimeWarning). Mask non-positive readings to NaN first; the result keeps
# the same index and length as df["odometer"].
logged_odometer = np.log(odometer.where(odometer > 0))
# Size the real figure via subplots(); plt.figure(figsize=...) followed by
# subplots() would only leave an empty extra figure behind.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(logged_odometer, kde=True, ax=ax)
ax.set_xlim(0, 20)
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
(0.0, 20.0)
<Figure size 1440x360 with 0 Axes>
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (with KDE overlay) of odometer readings over the full value range.
# No explicit Axes is created here, so histplot draws on the current axes of
# the 20x5 figure opened on the line above it.
plt.figure(figsize=(20, 5))
sns.histplot(
    df["odometer"],
    kde=True,
)
<AxesSubplot: xlabel='odometer', ylabel='Count'>
# Joint 2-D histogram of age against price with marginal distributions,
# restricted to ages 0..40 on the x axis.
sns.jointplot(
    data=df,
    x='age',
    y='price',
    kind="hist",
    xlim=(0, 40),
    height=10,
)
<seaborn.axisgrid.JointGrid at 0x14e9a11fba00>
# Filled bivariate KDE of age against price with a narrower bandwidth (0.5x).
# `fill=True` replaces the deprecated `shade=True`, which is slated to become
# an error in seaborn v0.14.0.
sns.kdeplot(x='age', y='price', data=df, cmap="Reds", fill=True, bw_adjust=.5)
/tmp/ipykernel_2985797/4078052607.py:1: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(x='age',y='price',data=df,cmap="Reds", shade=True, bw_adjust=.5)
<AxesSubplot: xlabel='age', ylabel='price'>
# Filled bivariate KDE of age against price; thresh=0 fills the whole axes
# instead of leaving the lowest-density region blank.
# `fill=True` replaces the deprecated `shade=True`, which is slated to become
# an error in seaborn v0.14.0.
sns.kdeplot(x='age', y='price', data=df, cmap="Reds", fill=True, thresh=0)
/tmp/ipykernel_2985797/2079726061.py:1: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(x='age',y='price',data=df,cmap="Reds", shade=True, thresh=0)
<AxesSubplot: xlabel='age', ylabel='price'>
import matplotlib.pyplot as plt
# 2-D histogram of age against price on a 100x100 bin grid, jet colour map.
plt.hist2d(
    x='age',
    y='price',
    bins=(100, 100),
    cmap=plt.cm.jet,
    data=df,
)
plt.show()
# Build the working frame used by the plots: the rows of vehicle_1 whose model
# is 'f-150', with numeric year/age/price columns and zero-price rows removed.
# .copy() makes df an independent frame, so the column assignments below write
# to df itself instead of a slice of vehicle_1 (no SettingWithCopyWarning).
df = vehicle_1[vehicle_1["model"] == 'f-150'].copy()
df["year"] = df["year"].astype(int)
# Age is measured relative to 2021.
df["age"] = (2021 - df["year"]).astype(float)
df["price"] = df["price"].astype(float)
# A price of exactly 0 is treated as unusable and dropped.
df = df[df["price"] != 0]
df.shape
/tmp/ipykernel_4151917/4065695957.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_4151917/4065695957.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = (2021 - df["year"]).astype(float) /tmp/ipykernel_4151917/4065695957.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
(3430, 27)
import matplotlib.pyplot as plt
# Scatter of price against vehicle age for the f-150 frame.
plt.figure(figsize=(15, 10))
plt.scatter(x="age", y="price", data=df, alpha=0.1)
plt.title("Price and Age of Secondhand Ford f-150", fontsize=20)
plt.xlabel("Vehicle Age (Year)", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.xlim(-2, 50)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Scatter of price against log(age) for the f-150 frame.
plt.figure(figsize=(15, 10))
# log() is undefined for ages <= 0 and warns on them; mask those to NaN so the
# points are simply not drawn (they could not be placed on the axis anyway).
f150_age = df["age"]
plt.scatter(x=np.log(f150_age.where(f150_age > 0)), y="price", data=df, alpha=0.1)
plt.xlabel("Logged Vehicle Age (log(Year))", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.title("Price and Logged Age of Secondhand Ford f-150", fontsize=20)
plt.xlim([-0.2, 4])
plt.show()
# Echo the dtype of the price column (sanity check after the float cast).
df["price"].dtypes
dtype('float64')
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Scatter plus fitted regression line of price against log(age).
# lmplot() takes column names for x/y and has no `alpha` keyword, so the logged
# age is added as a column of a temporary frame and the marker transparency is
# forwarded to the underlying scatter call through scatter_kws.
f150_age = df["age"]
# Ages <= 0 have no logarithm; mask them to NaN so they are left out of the fit.
lm_data = df.assign(logged_age=np.log(f150_age.where(f150_age > 0)))
# lmplot() is figure-level and creates its own figure, so the 15x10 size is
# requested with height/aspect (10 tall, 1.5x as wide) instead of plt.figure().
g = sns.lmplot(
    x="logged_age",
    y="price",
    data=lm_data,
    scatter_kws={"alpha": 0.1},
    height=10,
    aspect=1.5,
)
plt.xlabel("Logged Vehicle Age (log(Year))", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.title("Price and Logged Age of Secondhand Ford f-150", fontsize=20)
plt.xlim([-0.2, 4])
plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Input In [18], in <cell line: 5>() 3 import seaborn as sns 4 plt.figure(figsize=(15,10)) ----> 5 g = sns.lmplot(x=np.log(df["age"]), y="price",data=df,alpha=0.1) 6 plt.xlabel("Logged Vehicle Age (log(Year))", fontsize=15) 7 plt.ylabel("Vehicle Price (USD)", fontsize=15) TypeError: lmplot() got an unexpected keyword argument 'alpha'
<Figure size 1080x720 with 0 Axes>
import matplotlib.pyplot as plt
# Scatter of price against odometer reading for the f-150 frame.
plt.figure(figsize=(15, 10))
plt.scatter(x="odometer", y="price", data=df, alpha=0.1)
plt.title("Price and Mileage of Secondhand Ford f-150", fontsize=20)
plt.xlabel("Vehicle Mileage (Mile)", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.xlim(-5000, 320000)
plt.show()
import matplotlib.pyplot as plt
# Scatter of odometer reading against vehicle age for the f-150 frame.
plt.figure(figsize=(15, 10))
plt.scatter(x="age", y="odometer", data=df, alpha=0.1)
plt.title("Age and Mileage of Secondhand Ford f-150", fontsize=20)
plt.xlabel("Vehicle Age (Year)", fontsize=15)
plt.ylabel("Vehicle Mileage (Mile)", fontsize=15)
plt.xlim(-2, 40)
plt.ylim(-5000, 350000)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Scatter of price against log(odometer) for the f-150 frame.
plt.figure(figsize=(15, 10))
# Odometer readings of 0 have no logarithm (np.log warns and yields -inf);
# mask non-positive readings to NaN so those points are simply not drawn.
f150_odometer = df["odometer"]
plt.scatter(
    x=np.log(f150_odometer.where(f150_odometer > 0)),
    y="price",
    data=df,
    alpha=0.1,
)
plt.xlabel("Logged Vehicle Mileage (log(Mile))", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.title("Price and Logged Mileage of Secondhand Ford f-150", fontsize=20)
plt.xlim([9, 13])
plt.show()
import matplotlib.pyplot as plt
# Age-vs-price scatter with one colour per listing condition, all on one axes.
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
plt.figure(figsize=(15, 10))
# Dict order fixes both the drawing order and the order of the legend entries.
for cond_name, cond_color in colors.items():
    plt.scatter(
        x="age",
        y="price",
        data=df[df["condition"] == cond_name],
        color=cond_color,
        alpha=0.1,
    )
plt.xlabel("Vehicle Age (Year)", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.title("Price and Age of Secondhand Ford f-150 Categorized by Condition",
          fontsize=20)
# Legend entries read "<condition>: <colour>", one per scatter call above.
plt.legend(
    [f"{cond_name}: {cond_color}" for cond_name, cond_color in colors.items()],
    loc="upper right",
    fontsize=15,
)
plt.xlim([-2, 50])
plt.show()
import matplotlib.pyplot as plt
# Age-vs-odometer scatter with one colour per listing condition (no legend).
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
plt.figure(figsize=(15, 10))
# Conditions are drawn in dict order, later ones on top of earlier ones.
for cond_name, cond_color in colors.items():
    plt.scatter(
        x="age",
        y="odometer",
        data=df[df["condition"] == cond_name],
        color=cond_color,
        alpha=0.1,
    )
plt.xlabel("Vehicle Age (Year)", fontsize=15)
plt.ylabel("Vehicle Mileage (Mile)", fontsize=15)
plt.title("Age and Mileage of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
plt.xlim([-2, 40])
plt.ylim([-5000, 350000])
plt.show()
import matplotlib.pyplot as plt
# Odometer-vs-price scatter with one colour per listing condition.
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
plt.figure(figsize=(15, 10))
# Dict order fixes both the drawing order and the order of the legend entries.
for cond_name, cond_color in colors.items():
    plt.scatter(
        x="odometer",
        y="price",
        data=df[df["condition"] == cond_name],
        color=cond_color,
        alpha=0.1,
    )
plt.xlabel("Vehicle Mileage (Mile)", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.title("Price and Mileage of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Legend entries read "<condition>: <colour>", one per scatter call above.
plt.legend(
    [f"{cond_name}: {cond_color}" for cond_name, cond_color in colors.items()],
    loc="upper right",
    fontsize=15,
)
plt.xlim([-5000, 320000])
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# log(odometer)-vs-price scatter with one colour per listing condition.
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
plt.figure(figsize=(15, 10))
# Dict order fixes both the drawing order and the order of the legend entries.
for cond_name, cond_color in colors.items():
    cond_rows = df[df["condition"] == cond_name]
    # Readings of 0 have no logarithm (np.log warns and yields -inf); mask
    # non-positive readings to NaN so those points are simply not drawn.
    cond_odometer = cond_rows["odometer"]
    plt.scatter(
        x=np.log(cond_odometer.where(cond_odometer > 0)),
        y="price",
        data=cond_rows,
        color=cond_color,
        alpha=0.1,
    )
plt.xlabel("Vehicle Logged Mileage (log(Mile))", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.title("Price and Logged Mileage of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Legend entries read "<condition>: <colour>", one per scatter call above.
plt.legend(
    [f"{cond_name}: {cond_color}" for cond_name, cond_color in colors.items()],
    loc="upper right",
    fontsize=15,
)
plt.xlim([9, 13])
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# log(age)-vs-price scatter with one colour per listing condition.
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
plt.figure(figsize=(15, 10))
# Dict order fixes both the drawing order and the order of the legend entries.
for cond_name, cond_color in colors.items():
    cond_rows = df[df["condition"] == cond_name]
    # log() is undefined for ages <= 0 and warns on them; mask those to NaN so
    # the points are simply not drawn.
    cond_age = cond_rows["age"]
    plt.scatter(
        x=np.log(cond_age.where(cond_age > 0)),
        y="price",
        data=cond_rows,
        color=cond_color,
        alpha=0.1,
    )
plt.xlabel("Logged Vehicle Age (log(Year))", fontsize=15)
plt.ylabel("Vehicle Price (USD)", fontsize=15)
plt.title("Price and Logged Age of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Legend entries read "<condition>: <colour>", one per scatter call above.
plt.legend(
    [f"{cond_name}: {cond_color}" for cond_name, cond_color in colors.items()],
    loc="upper right",
    fontsize=15,
)
plt.xlim([-0.2, 4])
plt.show()
import matplotlib.pyplot as plt
# Age-vs-price scatter with one colour per listing condition, with both axes
# restricted to a fixed window.
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
plt.figure(figsize=(15, 10))
# Dict order fixes both the drawing order and the order of the legend entries.
for cond_name, cond_color in colors.items():
    plt.scatter(
        x="age",
        y="price",
        data=df[df["condition"] == cond_name],
        color=cond_color,
        alpha=0.1,
    )
plt.xlabel("Vehicle Age", fontsize=15)
plt.ylabel("Vehicle Price", fontsize=15)
plt.title("Price and Age of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Legend entries read "<condition>: <colour>", one per scatter call above.
plt.legend(
    [f"{cond_name}: {cond_color}" for cond_name, cond_color in colors.items()],
    loc="upper right",
    fontsize=15,
)
plt.xlim([-5, 50])
plt.ylim([-2000, 72000])
plt.show()
import matplotlib.pyplot as plt
# 2x2 grid of age-vs-price scatter plots, one panel per listing condition.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
fig.suptitle("Price and Age of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Panels are filled row by row: excellent, fair / good, like new.
for ax, cond_name in zip(axs.flat, ["excellent", "fair", "good", "like new"]):
    ax.scatter(
        x="age",
        y="price",
        data=df[df["condition"] == cond_name],
        color=colors[cond_name],
        alpha=0.2,
    )
    ax.set_title(f"{cond_name} condition", fontsize=15)
    # Age is in years and price in USD (the two units were swapped before).
    ax.set_xlabel("Vehicle Age (Year)", fontsize=10)
    ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
    # Identical limits on every panel keep the panels directly comparable.
    ax.set_xlim([-2, 40])
    ax.set_ylim([-2000, 70000])
    ax.legend([f"{cond_name} condition"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8cac209d0>
# Set x/y limits on whatever axes are current when this runs.
# NOTE(review): nothing is plotted in this cell, so it looks like a leftover
# from the age/mileage plots — confirm whether it is still needed.
plt.xlim([-2,40])
plt.ylim([-5000,350000])
import matplotlib.pyplot as plt
# 2x2 grid of age-vs-odometer scatter plots, one panel per listing condition.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
fig.suptitle("Age and Mileage of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Panels are filled row by row: excellent, fair / good, like new.
for ax, cond_name in zip(axs.flat, ["excellent", "fair", "good", "like new"]):
    ax.scatter(
        x="age",
        y="odometer",
        data=df[df["condition"] == cond_name],
        color=colors[cond_name],
        alpha=0.2,
    )
    ax.set_title(f"{cond_name} condition", fontsize=15)
    ax.set_xlabel("Vehicle Age (Year)", fontsize=10)
    ax.set_ylabel("Vehicle Mileage (Mile)", fontsize=10)
    # Identical limits on every panel keep the panels directly comparable.
    ax.set_xlim([-2, 40])
    ax.set_ylim([-5000, 350000])
    ax.legend([f"{cond_name} condition"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8caa1d420>
import matplotlib.pyplot as plt
# 2x2 grid of odometer-vs-price scatter plots, one panel per listing condition.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
fig.suptitle("Price and Mileage of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Panels are filled row by row: excellent, fair / good, like new.
for ax, cond_name in zip(axs.flat, ["excellent", "fair", "good", "like new"]):
    ax.scatter(
        x="odometer",
        y="price",
        data=df[df["condition"] == cond_name],
        color=colors[cond_name],
        alpha=0.2,
    )
    ax.set_title(f"{cond_name} condition", fontsize=15)
    ax.set_xlabel("Vehicle Mileage (Mile)", fontsize=10)
    # Price is in USD (the label previously said "Year").
    ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
    # Identical limits on every panel keep the panels directly comparable.
    ax.set_xlim([-2000, 320000])
    ax.set_ylim([-2000, 70000])
    ax.legend([f"{cond_name} condition"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8ca819390>
import matplotlib.pyplot as plt
import numpy as np
# 2x2 grid of log(odometer)-vs-price scatter plots, one panel per condition.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
fig.suptitle("Price and Logged Mileage of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Panels are filled row by row: excellent, fair / good, like new.
for ax, cond_name in zip(axs.flat, ["excellent", "fair", "good", "like new"]):
    cond_rows = df[df["condition"] == cond_name]
    # Readings of 0 have no logarithm (np.log warns and yields -inf); mask
    # non-positive readings to NaN so those points are simply not drawn.
    cond_odometer = cond_rows["odometer"]
    ax.scatter(
        x=np.log(cond_odometer.where(cond_odometer > 0)),
        y="price",
        data=cond_rows,
        color=colors[cond_name],
        alpha=0.2,
    )
    ax.set_title(f"{cond_name} condition", fontsize=15)
    ax.set_xlabel("Logged Vehicle Mileage (log(Mile))", fontsize=10)
    # Price is in USD (the label previously said "Year").
    ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
    # Identical limits on every panel keep the panels directly comparable.
    ax.set_xlim([9, 13])
    ax.set_ylim([-2000, 70000])
    ax.legend([f"{cond_name} condition"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8cd5ad600>
import matplotlib.pyplot as plt
import numpy as np
# 2x2 grid of log(age)-vs-price scatter plots, one panel per listing condition.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
colors = {
    'excellent': 'red',
    'fair': 'orange',
    'good': 'yellow',
    'like new': 'blue',
    'new': 'green',
    'salvage': 'purple',
}
fig.suptitle("Price and Logged Age of Secondhand Ford f-150 Categorized by Condition", fontsize=20)
# Panels are filled row by row: excellent, fair / good, like new.
for ax, cond_name in zip(axs.flat, ["excellent", "fair", "good", "like new"]):
    cond_rows = df[df["condition"] == cond_name]
    # log() is undefined for ages <= 0 and warns on them; mask those to NaN so
    # the points are simply not drawn.
    cond_age = cond_rows["age"]
    ax.scatter(
        x=np.log(cond_age.where(cond_age > 0)),
        y="price",
        data=cond_rows,
        color=colors[cond_name],
        alpha=0.2,
    )
    ax.set_title(f"{cond_name} condition", fontsize=15)
    ax.set_xlabel("Logged Vehicle Age (log(Year))", fontsize=10)
    ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
    # Identical limits on every panel keep the panels directly comparable.
    ax.set_xlim([-0.5, 4])
    ax.set_ylim([-2000, 72000])
    ax.legend([f"{cond_name} condition"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8dc477880>
# Echo the column labels of df to see which fields are available.
df.columns
Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
'image_url', 'description', 'county', 'state', 'lat', 'long',
'posting_date', 'posting_date_time', 'age'],
dtype='object')
# Tally how many rows of df carry each value of the "title_status" column.
# Counter consumes the column directly; an intermediate list() copy is not needed.
Counter(df["title_status"])
Counter({'clean': 2440,
'missing': 2,
'lien': 33,
'salvage': 22,
'rebuilt': 34,
'parts only': 2})
# Tally how many rows of df carry each value of the "type" column.
# Counter consumes the column directly; an intermediate list() copy is not needed.
Counter(df["type"])
Counter({'truck': 1404,
'pickup': 1097,
'van': 5,
'other': 13,
'sedan': 5,
'offroad': 3,
'hatchback': 1,
'coupe': 1,
'SUV': 4})
import matplotlib.pyplot as plt
# Side-by-side age-vs-price scatter plots for the "truck" and "pickup" types.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
colors = {'truck': 'red', 'pickup': 'orange'}
fig.suptitle("Price and Age of Secondhand Ford f-150 Categorized by Type", fontsize=20)
# Dict order gives the panel order: truck on the left, pickup on the right.
for ax, (type_name, type_color) in zip(axs, colors.items()):
    ax.scatter(
        x="age",
        y="price",
        data=df[df["type"] == type_name],
        color=type_color,
        alpha=0.2,
    )
    ax.set_title(f"{type_name} type", fontsize=15)
    ax.set_xlabel("Vehicle Age (Year)", fontsize=10)
    ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
    # Identical limits on both panels keep them directly comparable.
    ax.set_xlim([-2, 40])
    ax.set_ylim([-2000, 70000])
    ax.legend([type_name], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14c99374c7f0>
import matplotlib.pyplot as plt

# Price vs. age for the 8-cylinder and 6-cylinder rows of df, one panel each.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
left_ax, right_ax = axs
colors = {'8 cylinders': 'red', '6 cylinders': 'orange'}
fig.suptitle("Price and Age of Secondhand Ford f-150 Categorized by Cylinders", fontsize=20)

eight_cyl_rows = df[df["cylinders"] == "8 cylinders"]
six_cyl_rows = df[df["cylinders"] == "6 cylinders"]

# Left panel: 8-cylinder rows.
left_ax.scatter(x="age", y="price", data=eight_cyl_rows, color=colors["8 cylinders"], alpha=0.2)
left_ax.set_title('8 cylinders', fontsize=15)
left_ax.set_xlabel("Vehicle Age (Year)", fontsize=10)
left_ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
left_ax.set(xlim=(-2, 40), ylim=(-2000, 70000))
left_ax.legend(["8 cylinders"], loc="upper right", fontsize=10)

# Right panel: 6-cylinder rows, drawn with the same axis limits.
right_ax.scatter(x="age", y="price", data=six_cyl_rows, color=colors["6 cylinders"], alpha=0.2)
right_ax.set_title('6 cylinders', fontsize=15)
right_ax.set_xlabel("Vehicle Age (Year)", fontsize=10)
right_ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
right_ax.set(xlim=(-2, 40), ylim=(-2000, 70000))
right_ax.legend(["6 cylinders"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14c999b8cbe0>
import matplotlib.pyplot as plt
import numpy as np

# Price vs. log(age) for the 8-cylinder and 6-cylinder rows of df.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
# Keyed by cylinder category so each lookup matches the panel it colours.
colors = {'8 cylinders': 'red', '6 cylinders': 'orange'}
fig.suptitle("Price and Logged Age of Secondhand Ford f-150 Categorized by Cylinders", fontsize=20)

# log() is undefined for age <= 0 (np.log yields -inf/NaN and warns), so only
# strictly positive ages are kept; such points could not be drawn anyway.
positive_age = df["age"] > 0
eight_cyl_rows = df[(df["cylinders"] == "8 cylinders") & positive_age]
six_cyl_rows = df[(df["cylinders"] == "6 cylinders") & positive_age]

# Left panel: 8-cylinder rows.
axs[0].scatter(x=np.log(eight_cyl_rows["age"]), y="price", data=eight_cyl_rows,
               color=colors["8 cylinders"], alpha=0.2)
axs[0].set_title('8 cylinders', fontsize=15)
axs[0].set_xlabel("Logged Vehicle Age (log(Year))", fontsize=10)
axs[0].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[0].set_xlim([-0.5, 4])
axs[0].set_ylim([-2000, 70000])
axs[0].legend(["8 cylinders"], loc="upper right", fontsize=10)

# Right panel: 6-cylinder rows, same axis limits for comparability.
axs[1].scatter(x=np.log(six_cyl_rows["age"]), y="price", data=six_cyl_rows,
               color=colors["6 cylinders"], alpha=0.2)
axs[1].set_title('6 cylinders', fontsize=15)
axs[1].set_xlabel("Logged Vehicle Age (log(Year))", fontsize=10)
axs[1].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[1].set_xlim([-0.5, 4])
axs[1].set_ylim([-2000, 70000])
axs[1].legend(["6 cylinders"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14c996a376a0>
import matplotlib.pyplot as plt

# Price vs. odometer reading for the 8-cylinder and 6-cylinder rows of df.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
colors = {'8 cylinders': 'red', '6 cylinders': 'orange'}
fig.suptitle("Price and Mileage of Secondhand Ford f-150 Categorized by Cylinders", fontsize=20)

eight_cyl_rows = df[df["cylinders"] == "8 cylinders"]
six_cyl_rows = df[df["cylinders"] == "6 cylinders"]

# Left panel: 8-cylinder rows.
axs[0].scatter(x="odometer", y="price", data=eight_cyl_rows,
               color=colors["8 cylinders"], alpha=0.2)
axs[0].set_title('8 cylinders', fontsize=15)
axs[0].set_xlabel("Vehicle Mileage (Mile)", fontsize=10)
axs[0].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[0].set_xlim([-2000, 320000])
axs[0].set_ylim([-2000, 70000])
# Legend text names the cylinder group shown in this panel (it used to read "truck").
axs[0].legend(["8 cylinders"], loc="upper right", fontsize=10)

# Right panel: 6-cylinder rows, same axis limits for comparability.
axs[1].scatter(x="odometer", y="price", data=six_cyl_rows,
               color=colors["6 cylinders"], alpha=0.2)
axs[1].set_title('6 cylinders', fontsize=15)
axs[1].set_xlabel("Vehicle Mileage (Mile)", fontsize=10)
axs[1].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[1].set_xlim([-2000, 320000])
axs[1].set_ylim([-2000, 70000])
axs[1].legend(["6 cylinders"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14c996adaec0>
import matplotlib.pyplot as plt

# Odometer reading vs. age for the 8-cylinder and 6-cylinder rows of df.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
colors = {'8 cylinders': 'red', '6 cylinders': 'orange'}
fig.suptitle("Age and Mileage of Secondhand Ford f-150 Categorized by Cylinders", fontsize=20)

eight_cyl_rows = df[df["cylinders"] == "8 cylinders"]
six_cyl_rows = df[df["cylinders"] == "6 cylinders"]

# Left panel: 8-cylinder rows.
axs[0].scatter(x="age", y="odometer", data=eight_cyl_rows,
               color=colors["8 cylinders"], alpha=0.2)
axs[0].set_title('8 cylinders', fontsize=15)
axs[0].set_xlabel("Vehicle Age (Year)", fontsize=10)
# The y-axis carries the odometer column, so it is labelled as mileage, not price.
axs[0].set_ylabel("Vehicle Mileage (Mile)", fontsize=10)
axs[0].set_xlim([-2, 40])
axs[0].set_ylim([-2000, 320000])
axs[0].legend(["8 cylinders"], loc="upper right", fontsize=10)

# Right panel: 6-cylinder rows, same axis limits for comparability.
axs[1].scatter(x="age", y="odometer", data=six_cyl_rows,
               color=colors["6 cylinders"], alpha=0.2)
axs[1].set_title('6 cylinders', fontsize=15)
axs[1].set_xlabel("Vehicle Age (Year)", fontsize=10)
axs[1].set_ylabel("Vehicle Mileage (Mile)", fontsize=10)
axs[1].set_xlim([-2, 40])
axs[1].set_ylim([-2000, 320000])
axs[1].legend(["6 cylinders"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14c996811b70>
import matplotlib.pyplot as plt

# Odometer reading vs. age for the "truck" and "pickup" rows of df.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
colors = {'truck': 'red', 'pickup': 'orange'}
fig.suptitle("Age and Mileage of Secondhand Ford f-150 Categorized by Type", fontsize=20)

truck_rows = df[df["type"] == "truck"]
pickup_rows = df[df["type"] == "pickup"]

# Left panel: rows whose type is "truck".
axs[0].scatter(x="age", y="odometer", data=truck_rows, color=colors["truck"], alpha=0.2)
axs[0].set_title('truck type', fontsize=15)
axs[0].set_xlabel("Vehicle Age (Year)", fontsize=10)
# The y-axis carries the odometer column, so it is labelled as mileage, not price.
axs[0].set_ylabel("Vehicle Mileage (Mile)", fontsize=10)
axs[0].set_xlim([-2, 40])
axs[0].set_ylim([-2000, 320000])
axs[0].legend(["truck"], loc="upper right", fontsize=10)

# Right panel: rows whose type is "pickup", same axis limits for comparability.
axs[1].scatter(x="age", y="odometer", data=pickup_rows, color=colors["pickup"], alpha=0.2)
axs[1].set_title('pickup type', fontsize=15)
axs[1].set_xlabel("Vehicle Age (Year)", fontsize=10)
axs[1].set_ylabel("Vehicle Mileage (Mile)", fontsize=10)
axs[1].set_xlim([-2, 40])
axs[1].set_ylim([-2000, 320000])
axs[1].legend(["pickup"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8cbe4a620>
import matplotlib.pyplot as plt

# Price vs. odometer reading for the "truck" and "pickup" rows of df.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
left_ax, right_ax = axs
colors = {'truck': 'red', 'pickup': 'orange'}
fig.suptitle("Price and Mileage of Secondhand Ford f-150 Categorized by Type", fontsize=20)

truck_rows = df[df["type"] == "truck"]
pickup_rows = df[df["type"] == "pickup"]

# Left panel: rows whose type is "truck".
left_ax.scatter(x="odometer", y="price", data=truck_rows, color=colors["truck"], alpha=0.2)
left_ax.set_title('truck type', fontsize=15)
left_ax.set_xlabel("Vehicle Mileage (Mile)", fontsize=10)
left_ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
left_ax.set(xlim=(-2000, 320000), ylim=(-2000, 70000))
left_ax.legend(["truck"], loc="upper right", fontsize=10)

# Right panel: rows whose type is "pickup", drawn with the same axis limits.
right_ax.scatter(x="odometer", y="price", data=pickup_rows, color=colors["pickup"], alpha=0.2)
right_ax.set_title('pickup type', fontsize=15)
right_ax.set_xlabel("Vehicle Mileage (Mile)", fontsize=10)
right_ax.set_ylabel("Vehicle Price (USD)", fontsize=10)
right_ax.set(xlim=(-2000, 320000), ylim=(-2000, 70000))
right_ax.legend(["pickup"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8cd284d30>
import matplotlib.pyplot as plt
import numpy as np

# Price vs. log(odometer) for the "truck" and "pickup" rows of df.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
colors = {'truck': 'red', 'pickup': 'orange'}
fig.suptitle("Price and Logged Mileage of Secondhand Ford f-150 Categorized by Type", fontsize=20)

# log() is undefined for readings <= 0 (np.log yields -inf/NaN and warns), so
# only strictly positive odometer values are kept.
positive_mileage = df["odometer"] > 0
truck_rows = df[(df["type"] == "truck") & positive_mileage]
pickup_rows = df[(df["type"] == "pickup") & positive_mileage]

# Left panel: rows whose type is "truck".
axs[0].scatter(x=np.log(truck_rows["odometer"]), y="price", data=truck_rows,
               color=colors["truck"], alpha=0.2)
axs[0].set_title('truck type', fontsize=15)
axs[0].set_xlabel("Logged Vehicle Mileage (log(Mile))", fontsize=10)
axs[0].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[0].set_xlim([9, 13])
axs[0].set_ylim([-2000, 70000])
axs[0].legend(["truck"], loc="upper right", fontsize=10)

# Right panel: rows whose type is "pickup", same axis limits for comparability.
axs[1].scatter(x=np.log(pickup_rows["odometer"]), y="price", data=pickup_rows,
               color=colors["pickup"], alpha=0.2)
axs[1].set_title('pickup type', fontsize=15)
axs[1].set_xlabel("Logged Vehicle Mileage (log(Mile))", fontsize=10)
axs[1].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[1].set_xlim([9, 13])
axs[1].set_ylim([-2000, 70000])
axs[1].legend(["pickup"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8cb5f1a80>
import matplotlib.pyplot as plt
import numpy as np

# Price vs. log(age) for the "truck" and "pickup" rows of df.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
colors = {'truck': 'red', 'pickup': 'orange'}
fig.suptitle("Price and Logged Age of Secondhand Ford f-150 Categorized by Type", fontsize=20)

# log() is undefined for age <= 0 (np.log yields -inf/NaN and warns), so only
# strictly positive ages are kept.
positive_age = df["age"] > 0
truck_rows = df[(df["type"] == "truck") & positive_age]
pickup_rows = df[(df["type"] == "pickup") & positive_age]

# Left panel: rows whose type is "truck".
axs[0].scatter(x=np.log(truck_rows["age"]), y="price", data=truck_rows,
               color=colors["truck"], alpha=0.2)
axs[0].set_title('truck type', fontsize=15)
axs[0].set_xlabel("Logged Vehicle Age (log(Year))", fontsize=10)
axs[0].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[0].set_xlim([-0.2, 4])
axs[0].set_ylim([-2000, 70000])
axs[0].legend(["truck"], loc="upper right", fontsize=10)

# Right panel: rows whose type is "pickup", same axis limits for comparability.
axs[1].scatter(x=np.log(pickup_rows["age"]), y="price", data=pickup_rows,
               color=colors["pickup"], alpha=0.2)
axs[1].set_title('pickup type', fontsize=15)
axs[1].set_xlabel("Logged Vehicle Age (log(Year))", fontsize=10)
axs[1].set_ylabel("Vehicle Price (USD)", fontsize=10)
axs[1].set_xlim([-0.2, 4])
axs[1].set_ylim([-2000, 70000])
axs[1].legend(["pickup"], loc="upper right", fontsize=10)
<matplotlib.legend.Legend at 0x14b8cb21fe20>
import matplotlib.pyplot as plt

# Price vs. age for df, coloured by condition, with a legend entry per condition.
colors = {'excellent': 'red', 'fair': 'orange',
          'good': 'yellow', 'like new': 'blue',
          'new': 'green', 'salvage': 'purple'}
plt.figure(figsize=(15, 10))
# One scatter call per condition, so every legend entry is bound to an artist
# drawn in that condition's colour. A single scatter call exposes only one
# legend handle, so a list of six labels would display just the first one.
for condition_name, condition_color in colors.items():
    condition_rows = df[df["condition"] == condition_name]
    plt.scatter(condition_rows["age"], condition_rows["price"],
                color=condition_color, alpha=0.3,
                label=f"{condition_name}: {condition_color}")
plt.xlabel("Vehicle Age", fontsize=15)
plt.ylabel("Vehicle Price", fontsize=15)
plt.title("Price and Age of Secondhand Ford f-150", fontsize=20)
plt.legend(loc="upper right", fontsize=15)
plt.xlim([-5, 50])
plt.show()
import matplotlib.pyplot as plt

# Price vs. age for df; each point's colour is looked up from its condition.
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple'}
plt.figure(figsize=(20, 15))
condition_cat = df["condition"].astype('category')
point_colors = condition_cat.map(colors)
plt.scatter(df["age"], df["price"], c=point_colors, alpha=0.5)
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim(-5, 40)
plt.show()

import matplotlib.pyplot as plt

# Same figure, but the opacity is also looked up per point from the condition;
# the 'none' entry maps to alpha 0.
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple'}
alphas = {'excellent':0.5, 'fair':0.5, 'good':0.5, 'like new':0.5, 'new':0.5, 'salvage':0.5,'none':0}
plt.figure(figsize=(20, 15))
condition_cat = df["condition"].astype('category')
point_colors = condition_cat.map(colors)
point_alphas = condition_cat.map(alphas)
plt.scatter(df["age"], df["price"], c=point_colors, alpha=point_alphas)
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim(-5, 40)
plt.show()
import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age) for df. np.log gives -inf at age 0 and NaN for negative
# ages (with a RuntimeWarning), so only strictly positive ages are plotted.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"])
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age) for df at alpha 0.3. np.log gives -inf at age 0 and NaN
# for negative ages (with a RuntimeWarning), so only strictly positive ages
# are plotted.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"], alpha=0.3)
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age) for df at alpha 0.1. Only strictly positive ages are kept
# because np.log yields -inf/NaN (and warns) for age <= 0.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"], alpha=0.1)
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Same axes, with colour and opacity looked up per point from the condition;
# the 'none' entry maps to alpha 0.
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple','none':'pink'}
alphas = {'excellent':0.5, 'fair':0.5, 'good':0.5, 'like new':0.5, 'new':0.5, 'salvage':0.5,'none':0}
# Filter first so x, y, colours and alphas all describe the same rows.
positive_age_rows = df[df["age"] > 0]
condition_cat = positive_age_rows["condition"].astype('category')
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"],
            c=condition_cat.map(colors), alpha=condition_cat.map(alphas))
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age) for df, coloured by condition.
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple','none':'pink'}
# np.log yields -inf/NaN (and warns) for age <= 0; filter first so x, y and
# the colours all describe the same rows.
positive_age_rows = df[df["age"] > 0]
point_colors = positive_age_rows["condition"].astype('category').map(colors)
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"],
            c=point_colors, alpha=0.7)
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. odometer reading for df, x-axis up to 500000.
plt.figure(figsize=(20, 15))
mileage, price = df["odometer"], df["price"]
plt.scatter(mileage, price)
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim(-5000, 500000)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Same axes, points coloured by condition.
plt.figure(figsize=(20, 15))
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple','none':'pink'}
point_colors = df["condition"].astype('category').map(colors)
plt.scatter(df["odometer"], df["price"], c=point_colors, alpha=0.7)
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim(-5000, 500000)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Uncoloured plot again, x-axis narrowed to 300000.
plt.figure(figsize=(20, 15))
mileage, price = df["odometer"], df["price"]
plt.scatter(mileage, price)
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim(-5000, 300000)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Narrowed x-axis, points coloured by condition.
plt.figure(figsize=(20, 15))
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple','none':'pink'}
point_colors = df["condition"].astype('category').map(colors)
plt.scatter(df["odometer"], df["price"], c=point_colors, alpha=0.7)
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim(-5000, 300000)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Narrowed x-axis, colour and opacity both looked up from the condition;
# the 'none' entry maps to alpha 0.
plt.figure(figsize=(20, 15))
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple','none':'pink'}
alphas = {'excellent':0.5, 'fair':0.5, 'good':0.5, 'like new':0.5, 'new':0.5, 'salvage':0.5,'none':0}
condition_cat = df["condition"].astype('category')
point_colors = condition_cat.map(colors)
point_alphas = condition_cat.map(alphas)
plt.scatter(df["odometer"], df["price"], c=point_colors, alpha=point_alphas)
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim(-5000, 300000)
plt.show()
import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(odometer) for df. np.log gives -inf for a reading of 0 (with a
# RuntimeWarning), so only strictly positive readings are plotted.
positive_mileage_rows = df[df["odometer"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_mileage_rows["odometer"]), positive_mileage_rows["price"])
plt.xlabel("Mileage Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([8, 13])
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Odometer reading vs. age for df.
plt.figure(figsize=(20, 15))
age, mileage = df["age"], df["odometer"]
plt.scatter(age, mileage)
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim(-3, 50)
plt.ylim(-10000, 350000)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Same axes, points coloured by condition.
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple','none':'pink'}
plt.figure(figsize=(20, 15))
point_colors = df["condition"].astype('category').map(colors)
plt.scatter(df["age"], df["odometer"], c=point_colors, alpha=0.7)
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim(-3, 50)
plt.ylim(-10000, 350000)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Same axes, colour and opacity both looked up from the condition;
# the 'none' entry maps to alpha 0.
colors = {'excellent':'red', 'fair':'orange', 'good':'yellow', 'like new':'blue', 'new':'green', 'salvage':'purple','none':'pink'}
alphas = {'excellent':0.5, 'fair':0.5, 'good':0.5, 'like new':0.5, 'new':0.5, 'salvage':0.5,'none':0}
plt.figure(figsize=(20, 15))
condition_cat = df["condition"].astype('category')
point_colors = condition_cat.map(colors)
point_alphas = condition_cat.map(alphas)
plt.scatter(df["age"], df["odometer"], c=point_colors, alpha=point_alphas)
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim(-3, 50)
plt.ylim(-10000, 350000)
plt.show()
# Working frame: the rows of vehicle_2 whose model is 'silverado 1500'.
# .copy() makes it an independent frame, so the column assignments below do
# not raise SettingWithCopyWarning (they would on a filtered slice).
df = vehicle_2[vehicle_2["model"] == 'silverado 1500'].copy()
# NOTE(review): astype(int) raises if any year is missing - confirm upstream.
df["year"] = df["year"].astype(int)
# Age in years, measured from 2021.
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
/tmp/ipykernel_1816880/2323307577.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1816880/2323307577.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1816880/2323307577.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
import matplotlib.pyplot as plt

# Price vs. age for the rows currently in df, default axis limits.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["price"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()

import matplotlib.pyplot as plt

# Same plot with the x-axis restricted to ages -2..25.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["price"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-2, 25])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age). np.log gives -inf at age 0 and NaN for negative ages
# (with a RuntimeWarning), so only strictly positive ages are plotted.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"])
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. odometer reading for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["odometer"], df["price"])
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-5000, 300000])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(odometer). np.log gives -inf for a reading of 0, so only
# strictly positive readings are plotted.
positive_mileage_rows = df[df["odometer"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_mileage_rows["odometer"]), positive_mileage_rows["price"])
plt.xlabel("Mileage Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([8, 13])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Odometer reading vs. age.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["odometer"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim([-3, 50])
plt.ylim([-10000, 350000])
plt.show()
# Working frame: the rows of vehicle_2 whose model is '1500'.
# .copy() makes it an independent frame, so the column assignments below do
# not raise SettingWithCopyWarning (they would on a filtered slice).
df = vehicle_2[vehicle_2["model"] == '1500'].copy()
# NOTE(review): astype(int) raises if any year is missing - confirm upstream.
df["year"] = df["year"].astype(int)
# Age in years, measured from 2021.
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
/tmp/ipykernel_1816880/2848382175.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1816880/2848382175.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1816880/2848382175.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
import matplotlib.pyplot as plt

# Price vs. age for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["price"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age). np.log gives -inf at age 0 and NaN for negative ages
# (with a RuntimeWarning), so only strictly positive ages are plotted.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"])
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. odometer reading for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["odometer"], df["price"])
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-5000, 300000])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(odometer). np.log gives -inf for a reading of 0, so only
# strictly positive readings are plotted.
positive_mileage_rows = df[df["odometer"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_mileage_rows["odometer"]), positive_mileage_rows["price"])
plt.xlabel("Mileage Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([8, 13])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Odometer reading vs. age.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["odometer"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim([-3, 50])
plt.ylim([-10000, 350000])
plt.show()
# Working frame: the rows of vehicle_2 whose model is 'camry'.
# .copy() makes it an independent frame, so the column assignments below do
# not raise SettingWithCopyWarning (they would on a filtered slice).
df = vehicle_2[vehicle_2["model"] == 'camry'].copy()
# NOTE(review): astype(int) raises if any year is missing - confirm upstream.
df["year"] = df["year"].astype(int)
# Age in years, measured from 2021.
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
/tmp/ipykernel_1816880/1492278436.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1816880/1492278436.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1816880/1492278436.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
import matplotlib.pyplot as plt

# Price vs. age for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["price"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age). np.log gives -inf at age 0 and NaN for negative ages
# (with a RuntimeWarning), so only strictly positive ages are plotted.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"])
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. odometer reading for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["odometer"], df["price"])
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-5000, 300000])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(odometer). np.log gives -inf for a reading of 0, so only
# strictly positive readings are plotted.
positive_mileage_rows = df[df["odometer"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_mileage_rows["odometer"]), positive_mileage_rows["price"])
plt.xlabel("Mileage Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([8, 13])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Odometer reading vs. age.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["odometer"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim([-3, 50])
plt.ylim([-10000, 350000])
plt.show()
# Working frame: the rows of vehicle_2 whose model is 'silverado'.
# .copy() makes it an independent frame, so the column assignments below do
# not raise SettingWithCopyWarning (they would on a filtered slice).
df = vehicle_2[vehicle_2["model"] == 'silverado'].copy()
# NOTE(review): astype(int) raises if any year is missing - confirm upstream.
df["year"] = df["year"].astype(int)
# Age in years, measured from 2021.
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
/tmp/ipykernel_1816880/3604883723.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1816880/3604883723.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1816880/3604883723.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
import matplotlib.pyplot as plt

# Price vs. age for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["price"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age). np.log gives -inf at age 0 and NaN for negative ages
# (with a RuntimeWarning), so only strictly positive ages are plotted.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"])
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs) /home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. odometer reading for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["odometer"], df["price"])
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-5000, 300000])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(odometer). np.log gives -inf for a reading of 0, so only
# strictly positive readings are plotted.
positive_mileage_rows = df[df["odometer"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_mileage_rows["odometer"]), positive_mileage_rows["price"])
plt.xlabel("Mileage Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([8, 13])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Odometer reading vs. age.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["odometer"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim([-3, 50])
plt.ylim([-10000, 350000])
plt.show()
# Working frame: the rows of vehicle_2 whose model is 'accord'.
# .copy() makes it an independent frame, so the column assignments below do
# not raise SettingWithCopyWarning (they would on a filtered slice).
df = vehicle_2[vehicle_2["model"] == 'accord'].copy()
# NOTE(review): astype(int) raises if any year is missing - confirm upstream.
df["year"] = df["year"].astype(int)
# Age in years, measured from 2021.
df["age"] = 2021 - df["year"]
df["price"] = df["price"].astype(float)
/tmp/ipykernel_1816880/1156269704.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1816880/1156269704.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1816880/1156269704.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
import matplotlib.pyplot as plt

# Price vs. age for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["price"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(age). np.log gives -inf at age 0 and NaN for negative ages
# (with a RuntimeWarning), so only strictly positive ages are plotted.
positive_age_rows = df[df["age"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_age_rows["age"]), positive_age_rows["price"])
plt.xlabel("Age Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.show()
/home/yifeisun/.local/lib/python3.10/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
import numpy as np
import matplotlib.pyplot as plt

# Price vs. odometer reading for the rows currently in df.
plt.figure(figsize=(20, 15))
plt.scatter(df["odometer"], df["price"])
plt.xlabel("Mileage", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-5000, 300000])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Price vs. log(odometer). np.log gives -inf for a reading of 0, so only
# strictly positive readings are plotted.
positive_mileage_rows = df[df["odometer"] > 0]
plt.figure(figsize=(20, 15))
plt.scatter(np.log(positive_mileage_rows["odometer"]), positive_mileage_rows["price"])
plt.xlabel("Mileage Logged", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([8, 13])
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Odometer reading vs. age.
plt.figure(figsize=(20, 15))
plt.scatter(df["age"], df["odometer"])
plt.xlabel("Age", fontsize=15)
plt.ylabel("Mileage", fontsize=15)
plt.xlim([-3, 50])
plt.ylim([-10000, 350000])
plt.show()
import matplotlib.pyplot as plt

# Price vs. age, one labelled scatter series per model in the list.
plt.figure(figsize=(15, 10))
for model_name in ['f-150']:
    # .copy() so the derived columns are written to an independent frame
    # rather than a slice of vehicle_2 (avoids SettingWithCopyWarning).
    df = vehicle_2[vehicle_2["model"] == model_name].copy()
    df["year"] = df["year"].astype(int)
    df["age"] = 2021 - df["year"]
    df["price"] = df["price"].astype(float)
    # The model name goes in as the legend label. Passing vehicle_2["model"]
    # as a third positional argument made plt.plot treat it as data and raise
    # TypeError. Rows are unordered, so markers are drawn instead of a line.
    plt.scatter(df["age"], df["price"], label=model_name)
plt.title("Price and Age of Secondhand Vehicles by Model", fontsize=20)
plt.xlabel("Age", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-1, 30])
plt.legend()
plt.show()
/tmp/ipykernel_1816880/16990256.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1816880/16990256.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1816880/16990256.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Input In [19], in <cell line: 3>() 6 df["age"] = 2021 - df["year"] 7 df["price"] = df["price"].astype(float) ----> 8 plt.plot(df["age"], df["price"], vehicle_2["model"]) 9 plt.title("Count Percentage of Movies of Different Genres through the years (First 5 Genres)", fontsize=20) 10 plt.xlabel("Year", fontsize=15) File ~/.local/lib/python3.10/site-packages/matplotlib/pyplot.py:2740, in plot(scalex, scaley, data, *args, **kwargs) 2738 @_copy_docstring_and_deprecators(Axes.plot) 2739 def plot(*args, scalex=True, scaley=True, data=None, **kwargs): -> 2740 return gca().plot( 2741 *args, scalex=scalex, scaley=scaley, 2742 **({"data": data} if data is not None else {}), **kwargs) File ~/.local/lib/python3.10/site-packages/matplotlib/axes/_axes.py:1662, in Axes.plot(self, scalex, scaley, data, *args, **kwargs) 1419 """ 1420 Plot y versus x as lines and/or markers. 1421 (...) 1659 (``'green'``) or hex strings (``'#008000'``). 
1660 """ 1661 kwargs = cbook.normalize_kwargs(kwargs, mlines.Line2D) -> 1662 lines = [*self._get_lines(*args, data=data, **kwargs)] 1663 for line in lines: 1664 self.add_line(line) File ~/.local/lib/python3.10/site-packages/matplotlib/axes/_base.py:311, in _process_plot_var_args.__call__(self, data, *args, **kwargs) 309 this += args[0], 310 args = args[1:] --> 311 yield from self._plot_args( 312 this, kwargs, ambiguous_fmt_datakey=ambiguous_fmt_datakey) File ~/.local/lib/python3.10/site-packages/matplotlib/axes/_base.py:501, in _process_plot_var_args._plot_args(self, tup, kwargs, return_kwargs, ambiguous_fmt_datakey) 499 self.axes.xaxis.update_units(x) 500 if self.axes.yaxis is not None: --> 501 self.axes.yaxis.update_units(y) 503 if x.shape[0] != y.shape[0]: 504 raise ValueError(f"x and y must have same first dimension, but " 505 f"have shapes {x.shape} and {y.shape}") File ~/.local/lib/python3.10/site-packages/matplotlib/axis.py:1608, in Axis.update_units(self, data) 1606 neednew = self.converter != converter 1607 self.converter = converter -> 1608 default = self.converter.default_units(data, self) 1609 if default is not None and self.units is None: 1610 self.set_units(default) File ~/.local/lib/python3.10/site-packages/matplotlib/category.py:116, in StrCategoryConverter.default_units(data, axis) 114 # the conversion call stack is default_units -> axis_info -> convert 115 if axis.units is None: --> 116 axis.set_units(UnitData(data)) 117 else: 118 axis.units.update(data) File ~/.local/lib/python3.10/site-packages/matplotlib/category.py:192, in UnitData.__init__(self, data) 190 self._counter = itertools.count() 191 if data is not None: --> 192 self.update(data) File ~/.local/lib/python3.10/site-packages/matplotlib/category.py:227, in UnitData.update(self, data) 224 convertible = True 225 for val in OrderedDict.fromkeys(data): 226 # OrderedDict just iterates over unique values in data. 
--> 227 _api.check_isinstance((str, bytes), value=val) 228 if convertible: 229 # this will only be called so long as convertible is True. 230 convertible = self._str_is_convertible(val) File ~/.local/lib/python3.10/site-packages/matplotlib/_api/__init__.py:93, in check_isinstance(_types, **kwargs) 91 names.remove("None") 92 names.append("None") ---> 93 raise TypeError( 94 "{!r} must be an instance of {}, not a {}".format( 95 k, 96 ", ".join(names[:-1]) + " or " + names[-1] 97 if len(names) > 1 else names[0], 98 type_name(type(v)))) TypeError: 'value' must be an instance of str or bytes, not a float
import matplotlib.pyplot as plt

# Plot asking price against vehicle age for the first model found in the
# listings. Each model gets its own labelled series so plt.legend() has
# something to show.
plt.figure(figsize=(15, 10))
# unique() keeps first-appearance order (set() ordering is arbitrary), and
# dropna() removes the NaN "model", which can never match an == comparison.
for model_name in vehicle_2["model"].dropna().unique()[:1]:
    # .copy() makes an independent frame, so the column assignments below do
    # not raise SettingWithCopyWarning.
    df = vehicle_2[vehicle_2["model"] == model_name].copy()
    # astype(int) raises on NaN, so drop rows with a missing year or price.
    df = df.dropna(subset=["year", "price"])
    df["year"] = df["year"].astype(int)
    df["age"] = 2021 - df["year"]
    df["price"] = df["price"].astype(float)
    # The third positional argument of plt.plot is a format string, not data:
    # "." draws unconnected point markers (the listings are unordered), and
    # the model name goes in through label= for the legend.
    plt.plot(df["age"], df["price"], ".", label=str(model_name))
plt.title("Listing Price vs. Vehicle Age by Model (First Model)", fontsize=20)
plt.xlabel("Age (years)", fontsize=15)
plt.ylabel("Price", fontsize=15)
plt.xlim([-1, 30])
plt.legend()
plt.show()
/tmp/ipykernel_1816880/4170661589.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["year"] = df["year"].astype(int) /tmp/ipykernel_1816880/4170661589.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["age"] = 2021 - df["year"] /tmp/ipykernel_1816880/4170661589.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["price"] = df["price"].astype(float)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Input In [9], in <cell line: 3>() 6 df["age"] = 2021 - df["year"] 7 df["price"] = df["price"].astype(float) ----> 8 plt.plot(df["age"], df["price"], vehicle_2["model"]) 9 plt.title("Count Percentage of Movies of Different Genres through the years (First 5 Genres)", fontsize=20) 10 plt.xlabel("Year", fontsize=15) File ~/.local/lib/python3.10/site-packages/matplotlib/pyplot.py:2740, in plot(scalex, scaley, data, *args, **kwargs) 2738 @_copy_docstring_and_deprecators(Axes.plot) 2739 def plot(*args, scalex=True, scaley=True, data=None, **kwargs): -> 2740 return gca().plot( 2741 *args, scalex=scalex, scaley=scaley, 2742 **({"data": data} if data is not None else {}), **kwargs) File ~/.local/lib/python3.10/site-packages/matplotlib/axes/_axes.py:1662, in Axes.plot(self, scalex, scaley, data, *args, **kwargs) 1419 """ 1420 Plot y versus x as lines and/or markers. 1421 (...) 1659 (``'green'``) or hex strings (``'#008000'``). 
1660 """ 1661 kwargs = cbook.normalize_kwargs(kwargs, mlines.Line2D) -> 1662 lines = [*self._get_lines(*args, data=data, **kwargs)] 1663 for line in lines: 1664 self.add_line(line) File ~/.local/lib/python3.10/site-packages/matplotlib/axes/_base.py:311, in _process_plot_var_args.__call__(self, data, *args, **kwargs) 309 this += args[0], 310 args = args[1:] --> 311 yield from self._plot_args( 312 this, kwargs, ambiguous_fmt_datakey=ambiguous_fmt_datakey) File ~/.local/lib/python3.10/site-packages/matplotlib/axes/_base.py:501, in _process_plot_var_args._plot_args(self, tup, kwargs, return_kwargs, ambiguous_fmt_datakey) 499 self.axes.xaxis.update_units(x) 500 if self.axes.yaxis is not None: --> 501 self.axes.yaxis.update_units(y) 503 if x.shape[0] != y.shape[0]: 504 raise ValueError(f"x and y must have same first dimension, but " 505 f"have shapes {x.shape} and {y.shape}") File ~/.local/lib/python3.10/site-packages/matplotlib/axis.py:1608, in Axis.update_units(self, data) 1606 neednew = self.converter != converter 1607 self.converter = converter -> 1608 default = self.converter.default_units(data, self) 1609 if default is not None and self.units is None: 1610 self.set_units(default) File ~/.local/lib/python3.10/site-packages/matplotlib/category.py:116, in StrCategoryConverter.default_units(data, axis) 114 # the conversion call stack is default_units -> axis_info -> convert 115 if axis.units is None: --> 116 axis.set_units(UnitData(data)) 117 else: 118 axis.units.update(data) File ~/.local/lib/python3.10/site-packages/matplotlib/category.py:192, in UnitData.__init__(self, data) 190 self._counter = itertools.count() 191 if data is not None: --> 192 self.update(data) File ~/.local/lib/python3.10/site-packages/matplotlib/category.py:227, in UnitData.update(self, data) 224 convertible = True 225 for val in OrderedDict.fromkeys(data): 226 # OrderedDict just iterates over unique values in data. 
--> 227 _api.check_isinstance((str, bytes), value=val) 228 if convertible: 229 # this will only be called so long as convertible is True. 230 convertible = self._str_is_convertible(val) File ~/.local/lib/python3.10/site-packages/matplotlib/_api/__init__.py:93, in check_isinstance(_types, **kwargs) 91 names.remove("None") 92 names.append("None") ---> 93 raise TypeError( 94 "{!r} must be an instance of {}, not a {}".format( 95 k, 96 ", ".join(names[:-1]) + " or " + names[-1] 97 if len(names) > 1 else names[0], 98 type_name(type(v)))) TypeError: 'value' must be an instance of str or bytes, not a float
# Parse the posting_date column into datetimes and display the result.
posting_date_column = vehicle_2["posting_date"]
pd.to_datetime(posting_date_column)
27 2021-05-04 12:31:18-05:00
28 2021-05-04 12:31:08-05:00
29 2021-05-04 12:31:25-05:00
30 2021-05-04 10:41:31-05:00
31 2021-05-03 14:02:03-05:00
...
426875 2021-04-04 03:21:31-06:00
426876 2021-04-04 03:21:29-06:00
426877 2021-04-04 03:21:17-06:00
426878 2021-04-04 03:21:11-06:00
426879 2021-04-04 03:21:07-06:00
Name: posting_date, Length: 409234, dtype: object
# Parse the posting timestamps, then display them in sorted order to see
# the earliest and latest listings.
parsed_posting_dates = pd.to_datetime(vehicle_2["posting_date"])
parsed_posting_dates.sort_values()
271444 2021-04-04 03:00:25-04:00
271443 2021-04-04 03:00:51-04:00
271442 2021-04-04 03:01:16-04:00
419122 2021-04-04 02:03:15-05:00
263503 2021-04-04 01:10:12-06:00
...
406982 2021-05-04 20:58:48-07:00
414522 2021-05-04 23:59:45-04:00
422031 2021-05-04 23:07:07-05:00
418366 2021-05-04 23:12:08-05:00
422030 2021-05-04 23:24:09-05:00
Name: posting_date, Length: 409234, dtype: object
# Distinct calendar years in which listings were posted.
# pd.to_datetime returns a Series here, so date parts are reached through the
# .dt accessor (a Series has no .year attribute). utc=True normalises the
# mixed UTC offsets into one datetime64 dtype, which the .dt accessor needs.
# NOTE: years are therefore taken in UTC, not in each listing's local time.
set(pd.to_datetime(vehicle_2["posting_date"], utc=True).dt.year)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Input In [23], in <cell line: 1>() ----> 1 set(pd.to_datetime(vehicle_2["posting_date"]).year) File ~/.local/lib/python3.10/site-packages/pandas/core/generic.py:5902, in NDFrame.__getattr__(self, name) 5895 if ( 5896 name not in self._internal_names_set 5897 and name not in self._metadata 5898 and name not in self._accessors 5899 and self._info_axis._can_hold_identifiers_and_holds_name(name) 5900 ): 5901 return self[name] -> 5902 return object.__getattribute__(self, name) AttributeError: 'Series' object has no attribute 'year'
# Display the distinct values found in the "region" column.
region_values = vehicle["region"].tolist()
set(region_values)
{'SF bay area',
'abilene',
'akron / canton',
'albany',
'albuquerque',
'altoona-johnstown',
'amarillo',
'ames',
'anchorage / mat-su',
'ann arbor',
'annapolis',
'appleton-oshkosh-FDL',
'asheville',
'ashtabula',
'athens',
'atlanta',
'auburn',
'augusta',
'austin',
'bakersfield',
'baltimore',
'baton rouge',
'battle creek',
'beaumont / port arthur',
'bellingham',
'bemidji',
'bend',
'billings',
'binghamton',
'birmingham',
'bismarck',
'bloomington',
'bloomington-normal',
'boise',
'boone',
'boston',
'boulder',
'bowling green',
'bozeman',
'brainerd',
'brownsville',
'brunswick',
'buffalo',
'butte',
'cape cod / islands',
'catskills',
'cedar rapids',
'central NJ',
'central louisiana',
'central michigan',
'champaign urbana',
'charleston',
'charlotte',
'charlottesville',
'chattanooga',
'chautauqua',
'chicago',
'chico',
'chillicothe',
'cincinnati',
'clarksville',
'cleveland',
'clovis / portales',
'college station',
'colorado springs',
'columbia',
'columbia / jeff city',
'columbus',
'cookeville',
'corpus christi',
'corvallis/albany',
'cumberland valley',
'dallas / fort worth',
'danville',
'dayton / springfield',
'daytona beach',
'decatur',
'deep east texas',
'del rio / eagle pass',
'delaware',
'denver',
'des moines',
'detroit metro',
'dothan',
'dubuque',
'duluth / superior',
'east idaho',
'east oregon',
'eastern CO',
'eastern CT',
'eastern NC',
'eastern kentucky',
'eastern montana',
'eastern panhandle',
'eastern shore',
'eau claire',
'el paso',
'elko',
'elmira-corning',
'erie',
'eugene',
'evansville',
'fairbanks',
'fargo / moorhead',
'farmington',
'fayetteville',
'finger lakes',
'flagstaff / sedona',
'flint',
'florence',
'florence / muscle shoals',
'florida keys',
'fort collins / north CO',
'fort dodge',
'fort smith',
'fort smith, AR',
'fort wayne',
'frederick',
'fredericksburg',
'fresno / madera',
'ft myers / SW florida',
'gadsden-anniston',
'gainesville',
'galveston',
'glens falls',
'gold country',
'grand forks',
'grand island',
'grand rapids',
'great falls',
'green bay',
'greensboro',
'greenville / upstate',
'gulfport / biloxi',
'hanford-corcoran',
'harrisburg',
'harrisonburg',
'hartford',
'hattiesburg',
'hawaii',
'heartland florida',
'helena',
'hickory / lenoir',
'high rockies',
'hilton head',
'holland',
'houma',
'houston',
'hudson valley',
'humboldt county',
'huntington-ashland',
'huntsville / decatur',
'imperial county',
'indianapolis',
'inland empire',
'iowa city',
'ithaca',
'jackson',
'jacksonville',
'janesville',
'jersey shore',
'jonesboro',
'joplin',
'kalamazoo',
'kalispell',
'kansas city',
'kansas city, MO',
'kenai peninsula',
'kennewick-pasco-richland',
'kenosha-racine',
'killeen / temple / ft hood',
'kirksville',
'klamath falls',
'knoxville',
'kokomo',
'la crosse',
'la salle co',
'lafayette',
'lafayette / west lafayette',
'lake charles',
'lake of the ozarks',
'lakeland',
'lancaster',
'lansing',
'laredo',
'las cruces',
'las vegas',
'lawrence',
'lawton',
'lehigh valley',
'lewiston / clarkston',
'lexington',
'lima / findlay',
'lincoln',
'little rock',
'logan',
'long island',
'los angeles',
'louisville',
'lubbock',
'lynchburg',
'macon / warner robins',
'madison',
'maine',
'manhattan',
'mankato',
'mansfield',
'mason city',
'mattoon-charleston',
'mcallen / edinburg',
'meadville',
'medford-ashland',
'memphis',
'mendocino county',
'merced',
'meridian',
'milwaukee',
'minneapolis / st paul',
'missoula',
'mobile',
'modesto',
'mohave county',
'monroe',
'monterey bay',
'montgomery',
'morgantown',
'moses lake',
'muncie / anderson',
'muskegon',
'myrtle beach',
'nashville',
'new hampshire',
'new haven',
'new orleans',
'new river valley',
'new york city',
'norfolk / hampton roads',
'north central FL',
'north dakota',
'north jersey',
'north mississippi',
'north platte',
'northeast SD',
'northern WI',
'northern michigan',
'northern panhandle',
'northwest CT',
'northwest GA',
'northwest KS',
'northwest OK',
'ocala',
'odessa / midland',
'ogden-clearfield',
'okaloosa / walton',
'oklahoma city',
'olympic peninsula',
'omaha / council bluffs',
'oneonta',
'orange county',
'oregon coast',
'orlando',
'outer banks',
'owensboro',
'palm springs',
'panama city',
'parkersburg-marietta',
'pensacola',
'peoria',
'philadelphia',
'phoenix',
'pierre / central SD',
'pittsburgh',
'plattsburgh-adirondacks',
'poconos',
'port huron',
'portland',
'potsdam-canton-massena',
'prescott',
'provo / orem',
'pueblo',
'pullman / moscow',
'quad cities, IA/IL',
'raleigh / durham / CH',
'rapid city / west SD',
'reading',
'redding',
'reno / tahoe',
'rhode island',
'richmond',
'roanoke',
'rochester',
'rockford',
'roseburg',
'roswell / carlsbad',
'sacramento',
'saginaw-midland-baycity',
'salem',
'salina',
'salt lake city',
'san angelo',
'san antonio',
'san diego',
'san luis obispo',
'san marcos',
'sandusky',
'santa barbara',
'santa fe / taos',
'santa maria',
'sarasota-bradenton',
'savannah / hinesville',
'scottsbluff / panhandle',
'scranton / wilkes-barre',
'seattle-tacoma',
'sheboygan',
'show low',
'shreveport',
'sierra vista',
'sioux city',
'sioux falls / SE SD',
'siskiyou county',
'skagit / island / SJI',
'south bend / michiana',
'south coast',
'south dakota',
'south florida',
'south jersey',
'southeast IA',
'southeast KS',
'southeast alaska',
'southeast missouri',
'southern WV',
'southern illinois',
'southern maryland',
'southwest KS',
'southwest MN',
'southwest MS',
'southwest TX',
'southwest VA',
'southwest michigan',
'space coast',
"spokane / coeur d'alene",
'springfield',
'st augustine',
'st cloud',
'st george',
'st joseph',
'st louis',
'st louis, MO',
'state college',
'statesboro',
'stillwater',
'stockton',
'susanville',
'syracuse',
'tallahassee',
'tampa bay area',
'terre haute',
'texarkana',
'texoma',
'the thumb',
'toledo',
'topeka',
'treasure coast',
'tri-cities',
'tucson',
'tulsa',
'tuscaloosa',
'tuscarawas co',
'twin falls',
'twin tiers NY/PA',
'tyler / east TX',
'upper peninsula',
'utica-rome-oneida',
'valdosta',
'ventura county',
'vermont',
'victoria',
'visalia-tulare',
'waco',
'washington, DC',
'waterloo / cedar falls',
'watertown',
'wausau',
'wenatchee',
'west virginia (old)',
'western IL',
'western KY',
'western maryland',
'western massachusetts',
'western slope',
'wichita',
'wichita falls',
'williamsport',
'wilmington',
'winchester',
'winston-salem',
'worcester / central MA',
'wyoming',
'yakima',
'york',
'youngstown',
'yuba-sutter',
'yuma',
'zanesville / cambridge'}
# Count how many distinct values the "region" column holds.
distinct_regions = set(vehicle["region"].tolist())
len(distinct_regions)
404
# Spot-check one listing: show the URL stored at positional row 4567.
listing_urls = vehicle_2["url"]
listing_urls.iat[4567]
'https://tuscaloosa.craigslist.org/cto/d/tuscaloosa-1968-chevrolet-caprice/7312092765.html'
# Load the CarPrice_Assignment CSV into `carprice` and display the frame.
carprice = pd.read_csv(filepath_or_buffer="CarPrice_Assignment.csv")
carprice
| car_ID | symboling | CarName | fueltype | aspiration | doornumber | carbody | drivewheel | enginelocation | wheelbase | ... | enginesize | fuelsystem | boreratio | stroke | compressionratio | horsepower | peakrpm | citympg | highwaympg | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | alfa-romero giulia | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495.0 |
| 1 | 2 | 3 | alfa-romero stelvio | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500.0 |
| 2 | 3 | 1 | alfa-romero Quadrifoglio | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500.0 |
| 3 | 4 | 2 | audi 100 ls | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950.0 |
| 4 | 5 | 2 | audi 100ls | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | -1 | volvo 145e (sw) | gas | std | four | sedan | rwd | front | 109.1 | ... | 141 | mpfi | 3.78 | 3.15 | 9.5 | 114 | 5400 | 23 | 28 | 16845.0 |
| 201 | 202 | -1 | volvo 144ea | gas | turbo | four | sedan | rwd | front | 109.1 | ... | 141 | mpfi | 3.78 | 3.15 | 8.7 | 160 | 5300 | 19 | 25 | 19045.0 |
| 202 | 203 | -1 | volvo 244dl | gas | std | four | sedan | rwd | front | 109.1 | ... | 173 | mpfi | 3.58 | 2.87 | 8.8 | 134 | 5500 | 18 | 23 | 21485.0 |
| 203 | 204 | -1 | volvo 246 | diesel | turbo | four | sedan | rwd | front | 109.1 | ... | 145 | idi | 3.01 | 3.40 | 23.0 | 106 | 4800 | 26 | 27 | 22470.0 |
| 204 | 205 | -1 | volvo 264gl | gas | turbo | four | sedan | rwd | front | 109.1 | ... | 141 | mpfi | 3.78 | 3.15 | 9.5 | 114 | 5400 | 19 | 25 | 22625.0 |
205 rows × 26 columns